; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s

; InstCombine coverage for the x86 PMULUDQ (unsigned) and PMULDQ (signed)
; widening-multiply intrinsics at 128, 256 and 512 bits.
;
; The x86 triple is passed on the command line because this file sets no
; target triple of its own, and the folds verified below are applied to
; x86-specific intrinsics.

;
; UNDEF Elts
;

; Both operands undef: every variant is expected to fold to zeroinitializer.

define <2 x i64> @undef_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_128(
; CHECK-NEXT:    ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> undef)
  ret <2 x i64> %1
}

define <4 x i64> @undef_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_256(
; CHECK-NEXT:    ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> undef, <8 x i32> undef)
  ret <4 x i64> %1
}

define <8 x i64> @undef_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_512(
; CHECK-NEXT:    ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> undef)
  ret <8 x i64> %1
}

define <2 x i64> @undef_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_128(
; CHECK-NEXT:    ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> undef, <4 x i32> undef)
  ret <2 x i64> %1
}

define <4 x i64> @undef_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_256(
; CHECK-NEXT:    ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> undef)
  ret <4 x i64> %1
}

define <8 x i64> @undef_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_512(
; CHECK-NEXT:    ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> undef, <16 x i32> undef)
  ret <8 x i64> %1
}

; One operand undef and the other all-zero. The tests alternate which side
; holds the undef so that both operand positions are exercised; the expected
; result is zeroinitializer in every case.

define <2 x i64> @undef_zero_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_128(
; CHECK-NEXT:    ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> zeroinitializer)
  ret <2 x i64> %1
}

define <4 x i64> @undef_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_256(
; CHECK-NEXT:    ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> undef)
  ret <4 x i64> %1
}

define <8 x i64> @undef_zero_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_512(
; CHECK-NEXT:    ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> zeroinitializer)
  ret <8 x i64> %1
}

define <2 x i64> @undef_zero_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_128(
; CHECK-NEXT:    ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> zeroinitializer, <4 x i32> undef)
  ret <2 x i64> %1
}

define <4 x i64> @undef_zero_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_256(
; CHECK-NEXT:    ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> zeroinitializer)
  ret <4 x i64> %1
}

define <8 x i64> @undef_zero_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_512(
; CHECK-NEXT:    ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> undef)
  ret <8 x i64> %1
}

;
; Constant Folding
;

; Fully constant operands (some lanes undef) are expected to fold to a constant
; vector. Each i64 result lane is the product of the even-indexed i32 lanes of
; the two operands: zero-extended for pmuludq, sign-extended for pmuldq.

define <2 x i64> @fold_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @fold_pmuludq_128(
; CHECK-NEXT:    ret <2 x i64> <i64 9223372030412324865, i64 4294967295>
;
  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2147483647, i32 1, i32 1, i32 3>)
  ret <2 x i64> %1
}

define <4 x i64> @fold_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @fold_pmuludq_256(
; CHECK-NEXT:    ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer)
  ret <4 x i64> %1
}

define <8 x i64> @fold_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @fold_pmuludq_512(
; CHECK-NEXT:    ret <8 x i64> <i64 0, i64 0, i64 255, i64 131070, i64 0, i64 -281474976645121, i64 140737488289792, i64 281470681743360>
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> <i32 0, i32 0, i32 undef, i32 0, i32 1, i32 1, i32 2, i32 2, i32 undef, i32 undef, i32 -1, i32 -1, i32 65536, i32 -1, i32 -65536, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 255, i32 -256, i32 65535, i32 -65536, i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
  ret <8 x i64> %1
}

define <2 x i64> @fold_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @fold_pmuldq_128(
; CHECK-NEXT:    ret <2 x i64> <i64 0, i64 2>
;
  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 undef, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 undef, i32 1, i32 -2, i32 3>)
  ret <2 x i64> %1
}

define <4 x i64> @fold_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @fold_pmuldq_256(
; CHECK-NEXT:    ret <4 x i64> <i64 0, i64 4294836225, i64 140737488289792, i64 -140737488355328>
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> <i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>, <8 x i32> <i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
  ret <4 x i64> %1
}

define <8 x i64> @fold_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @fold_pmuldq_512(
; CHECK-NEXT:    ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> <i32 undef, i32 -1, i32 -3, i32 -1, i32 8, i32 10, i32 -256, i32 65536, i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>)
  ret <8 x i64> %1
}

;
; PMULUDQ/PMULDQ - only the even elements (0, 2, 4, 6) of the vXi32 inputs are required.
;

; In the expected output the intrinsic call is replaced by generic IR: the
; operands are bitcast to the i64 vector type, the low 32 bits of each lane are
; extended (an 'and' mask for pmuludq, a shl/ashr pair for pmuldq), and the
; results are multiplied (mul nuw for pmuludq, mul nsw for pmuldq).

; The trailing splat shuffle reads only result lane 0, so the expected input
; shuffle masks have elements 2 and 3 replaced by poison.
define <2 x i64> @test_demanded_elts_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuludq_128(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 poison, i32 poison>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i64> [[TMP3]], <i64 4294967295, i64 poison>
; CHECK-NEXT:    [[TMP6:%.*]] = and <2 x i64> [[TMP4]], <i64 4294967295, i64 poison>
; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <2 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    ret <2 x i64> [[TMP8]]
;
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %3 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %1, <4 x i32> %2)
  %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %4
}

define <4 x i64> @test_demanded_elts_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuludq_256(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i64> [[TMP3]], splat (i64 4294967295)
; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i64> [[TMP4]], splat (i64 4294967295)
; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    ret <4 x i64> [[TMP7]]
;
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %3 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %1, <8 x i32> %2)
  ret <4 x i64> %3
}

define <8 x i64> @test_demanded_elts_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuludq_512(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i64> [[TMP3]], splat (i64 4294967295)
; CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295)
; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <8 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT:    ret <8 x i64> [[TMP7]]
;
  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %3 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %1, <16 x i32> %2)
  ret <8 x i64> %3
}

define <2 x i64> @test_demanded_elts_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuldq_128(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = shl <2 x i64> [[TMP3]], splat (i64 32)
; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <2 x i64> [[TMP5]], splat (i64 32)
; CHECK-NEXT:    [[TMP7:%.*]] = shl <2 x i64> [[TMP4]], splat (i64 32)
; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <2 x i64> [[TMP7]], splat (i64 32)
; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <2 x i64> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    ret <2 x i64> [[TMP9]]
;
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %3 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %1, <4 x i32> %2)
  ret <2 x i64> %3
}

; The result is additionally shuffled so that only lanes 0 and 3 are read.
define <4 x i64> @test_demanded_elts_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuldq_256(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> poison, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i64> [[TMP3]], splat (i64 32)
; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], splat (i64 32)
; CHECK-NEXT:    [[TMP7:%.*]] = shl <4 x i64> [[TMP4]], splat (i64 32)
; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <4 x i64> [[TMP7]], splat (i64 32)
; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i64> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
; CHECK-NEXT:    ret <4 x i64> [[TMP10]]
;
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %3 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %1, <8 x i32> %2)
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
  ret <4 x i64> %4
}

; The result is additionally shuffled so that only lanes 0, 3, 4 and 7 are read.
define <8 x i64> @test_demanded_elts_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuldq_512(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> poison, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> poison, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT:    [[TMP5:%.*]] = shl <8 x i64> [[TMP3]], splat (i64 32)
; CHECK-NEXT:    [[TMP6:%.*]] = ashr exact <8 x i64> [[TMP5]], splat (i64 32)
; CHECK-NEXT:    [[TMP7:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32)
; CHECK-NEXT:    [[TMP8:%.*]] = ashr exact <8 x i64> [[TMP7]], splat (i64 32)
; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <8 x i64> [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
; CHECK-NEXT:    ret <8 x i64> [[TMP10]]
;
  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %3 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %1, <16 x i32> %2)
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
  ret <8 x i64> %4
}

; Intrinsic declarations, grouped by vector width (128, 256, 512 bits).
declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone

declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) nounwind readnone
declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>) nounwind readnone