; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -mtriple=x86_64-unknown-unknown -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Verify that instcombine is able to fold identity shuffles.
; The .pd variants select with bit 1 of each i64 mask element (see the
; "negative test" comment near the end of this file), so <0, 2, 0, 2, ...>
; is the identity mask for them.

define <4 x float> @identity_test_vpermilvar_ps(<4 x float> %v) {
; CHECK-LABEL: @identity_test_vpermilvar_ps(
; CHECK-NEXT:    ret <4 x float> [[V:%.*]]
;
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
  ret <4 x float> %a
}

define <8 x float> @identity_test_vpermilvar_ps_256(<8 x float> %v) {
; CHECK-LABEL: @identity_test_vpermilvar_ps_256(
; CHECK-NEXT:    ret <8 x float> [[V:%.*]]
;
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>)
  ret <8 x float> %a
}

; The wide variants permute within each 128-bit lane, so the lane-repeating
; mask <0,1,2,3, 0,1,2,3, ...> is still an identity (compare the
; zero_test_vpermilvar_ps_256 expected shuffle below).
define <16 x float> @identity_test_vpermilvar_ps_512(<16 x float> %v) {
; CHECK-LABEL: @identity_test_vpermilvar_ps_512(
; CHECK-NEXT:    ret <16 x float> [[V:%.*]]
;
  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>)
  ret <16 x float> %a
}

define <2 x double> @identity_test_vpermilvar_pd(<2 x double> %v) {
; CHECK-LABEL: @identity_test_vpermilvar_pd(
; CHECK-NEXT:    ret <2 x double> [[V:%.*]]
;
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 0, i64 2>)
  ret <2 x double> %a
}

define <4 x double> @identity_test_vpermilvar_pd_256(<4 x double> %v) {
; CHECK-LABEL: @identity_test_vpermilvar_pd_256(
; CHECK-NEXT:    ret <4 x double> [[V:%.*]]
;
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 0, i64 2, i64 0, i64 2>)
  ret <4 x double> %a
}

define <8 x double> @identity_test_vpermilvar_pd_512(<8 x double> %v) {
; CHECK-LABEL: @identity_test_vpermilvar_pd_512(
; CHECK-NEXT:    ret <8 x double> [[V:%.*]]
;
  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 0, i64 2, i64 0, i64 2, i64 0, i64 2, i64 0, i64 2>)
  ret <8 x double> %a
}

; Instcombine should be able to fold the following byte shuffle to a builtin shufflevector
; with a shuffle mask of all zeroes.
; (For the .256/.512 variants "all zeroes" means element 0 of each 128-bit
; lane, hence the <0,0,0,0, 4,4,4,4, ...> expected masks.)

define <4 x float> @zero_test_vpermilvar_ps_zero(<4 x float> %v) {
; CHECK-LABEL: @zero_test_vpermilvar_ps_zero(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    ret <4 x float> [[A]]
;
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> zeroinitializer)
  ret <4 x float> %a
}

define <8 x float> @zero_test_vpermilvar_ps_256_zero(<8 x float> %v) {
; CHECK-LABEL: @zero_test_vpermilvar_ps_256_zero(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[V:%.*]], <8 x float> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT:    ret <8 x float> [[A]]
;
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> zeroinitializer)
  ret <8 x float> %a
}

define <16 x float> @zero_test_vpermilvar_ps_512_zero(<16 x float> %v) {
; CHECK-LABEL: @zero_test_vpermilvar_ps_512_zero(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <16 x float> [[V:%.*]], <16 x float> poison, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
; CHECK-NEXT:    ret <16 x float> [[A]]
;
  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> zeroinitializer)
  ret <16 x float> %a
}

define <2 x double> @zero_test_vpermilvar_pd_zero(<2 x double> %v) {
; CHECK-LABEL: @zero_test_vpermilvar_pd_zero(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT:    ret <2 x double> [[A]]
;
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> zeroinitializer)
  ret <2 x double> %a
}

define <4 x double> @zero_test_vpermilvar_pd_256_zero(<4 x double> %v) {
; CHECK-LABEL: @zero_test_vpermilvar_pd_256_zero(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x double> [[V:%.*]], <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
; CHECK-NEXT:    ret <4 x double> [[A]]
;
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> zeroinitializer)
  ret <4 x double> %a
}

define <8 x double> @zero_test_vpermilvar_pd_512_zero(<8 x double> %v) {
; CHECK-LABEL: @zero_test_vpermilvar_pd_512_zero(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x double> [[V:%.*]], <8 x double> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
; CHECK-NEXT:    ret <8 x double> [[A]]
;
  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> zeroinitializer)
  ret <8 x double> %a
}

; Verify that instcombine is able to fold constant shuffles.
define <4 x float> @test_vpermilvar_ps(<4 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT:    ret <4 x float> [[A]]
;
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
  ret <4 x float> %a
}

; Note: the intrinsic mask is per-128-bit-lane, so the folded shufflevector
; masks below are lane-relative rewrites of the intrinsic's constant mask.
define <8 x float> @test_vpermilvar_ps_256(<8 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_256(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[V:%.*]], <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
; CHECK-NEXT:    ret <8 x float> [[A]]
;
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %a
}

define <16 x float> @test_vpermilvar_ps_512(<16 x float> %v) {
; CHECK-LABEL: @test_vpermilvar_ps_512(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <16 x float> [[V:%.*]], <16 x float> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
; CHECK-NEXT:    ret <16 x float> [[A]]
;
  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>)
  ret <16 x float> %a
}

define <2 x double> @test_vpermilvar_pd(<2 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    ret <2 x double> [[A]]
;
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 2, i64 0>)
  ret <2 x double> %a
}

define <4 x double> @test_vpermilvar_pd_256(<4 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_256(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x double> [[V:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
; CHECK-NEXT:    ret <4 x double> [[A]]
;
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 3, i64 1, i64 2, i64 0>)
  ret <4 x double> %a
}

define <8 x double> @test_vpermilvar_pd_512(<8 x double> %v) {
; CHECK-LABEL: @test_vpermilvar_pd_512(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x double> [[V:%.*]], <8 x double> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
; CHECK-NEXT:    ret <8 x double> [[A]]
;
  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 3, i64 1, i64 2, i64 0, i64 7, i64 5, i64 6, i64 4>)
  ret <8 x double> %a
}

; Verify that instcombine is able to fold constant shuffles with undef mask elements.

define <4 x float> @undef_test_vpermilvar_ps(<4 x float> %v) {
; CHECK-LABEL: @undef_test_vpermilvar_ps(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 2, i32 1, i32 poison>
; CHECK-NEXT:    ret <4 x float> [[A]]
;
  %a = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %v, <4 x i32> <i32 undef, i32 2, i32 1, i32 undef>)
  ret <4 x float> %a
}

define <8 x float> @undef_test_vpermilvar_ps_256(<8 x float> %v) {
; CHECK-LABEL: @undef_test_vpermilvar_ps_256(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x float> [[V:%.*]], <8 x float> poison, <8 x i32> <i32 poison, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4>
; CHECK-NEXT:    ret <8 x float> [[A]]
;
  %a = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %v, <8 x i32> <i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0>)
  ret <8 x float> %a
}

define <16 x float> @undef_test_vpermilvar_ps_512(<16 x float> %v) {
; CHECK-LABEL: @undef_test_vpermilvar_ps_512(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <16 x float> [[V:%.*]], <16 x float> poison, <16 x i32> <i32 poison, i32 2, i32 1, i32 poison, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 10, i32 9, i32 poison, i32 15, i32 14, i32 13, i32 12>
; CHECK-NEXT:    ret <16 x float> [[A]]
;
  %a = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %v, <16 x i32> <i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0, i32 undef, i32 6, i32 5, i32 undef, i32 3, i32 2, i32 1, i32 0>)
  ret <16 x float> %a
}

define <2 x double> @undef_test_vpermilvar_pd(<2 x double> %v) {
; CHECK-LABEL: @undef_test_vpermilvar_pd(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; CHECK-NEXT:    ret <2 x double> [[A]]
;
  %a = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %v, <2 x i64> <i64 undef, i64 0>)
  ret <2 x double> %a
}

define <4 x double> @undef_test_vpermilvar_pd_256(<4 x double> %v) {
; CHECK-LABEL: @undef_test_vpermilvar_pd_256(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <4 x double> [[V:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 0, i32 3, i32 poison>
; CHECK-NEXT:    ret <4 x double> [[A]]
;
  %a = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %v, <4 x i64> <i64 undef, i64 1, i64 2, i64 undef>)
  ret <4 x double> %a
}

define <8 x double> @undef_test_vpermilvar_pd_512(<8 x double> %v) {
; CHECK-LABEL: @undef_test_vpermilvar_pd_512(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <8 x double> [[V:%.*]], <8 x double> poison, <8 x i32> <i32 poison, i32 0, i32 3, i32 poison, i32 poison, i32 4, i32 7, i32 poison>
; CHECK-NEXT:    ret <8 x double> [[A]]
;
  %a = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %v, <8 x i64> <i64 undef, i64 1, i64 2, i64 undef, i64 undef, i64 1, i64 2, i64 undef>)
  ret <8 x double> %a
}

; Simplify demanded bits (PR106413)
; Only the low index bits of each mask element are demanded (.ps reads bits
; 0-1, .pd reads bit 1), so an 'or' that sets only the ignored bits can be
; removed entirely.

define <4 x float> @bits_test_vpermilvar_ps(<4 x float> %InVec, <4 x i32> %InMask) {
; CHECK-LABEL: @bits_test_vpermilvar_ps(
; CHECK-NEXT:    [[S:%.*]] = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[INVEC:%.*]], <4 x i32> [[INMASK:%.*]])
; CHECK-NEXT:    ret <4 x float> [[S]]
;
  %m = or <4 x i32> %InMask, <i32 0, i32 12, i32 4294967292, i32 -4>
  %s = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %InVec, <4 x i32> %m)
  ret <4 x float> %s
}

define <8 x float> @bits_test_vpermilvar_ps_256(<8 x float> %InVec, <8 x i32> %InMask) {
; CHECK-LABEL: @bits_test_vpermilvar_ps_256(
; CHECK-NEXT:    [[S:%.*]] = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[INVEC:%.*]], <8 x i32> [[INMASK:%.*]])
; CHECK-NEXT:    ret <8 x float> [[S]]
;
  %m = or <8 x i32> %InMask, <i32 0, i32 12, i32 4294967292, i32 -4, i32 0, i32 12, i32 4294967292, i32 -4>
  %s = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %InVec, <8 x i32> %m)
  ret <8 x float> %s
}

define <16 x float> @bits_test_vpermilvar_ps_512(<16 x float> %InVec, <16 x i32> %InMask) {
; CHECK-LABEL: @bits_test_vpermilvar_ps_512(
; CHECK-NEXT:    [[S:%.*]] = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[INVEC:%.*]], <16 x i32> [[INMASK:%.*]])
; CHECK-NEXT:    ret <16 x float> [[S]]
;
  %m = or <16 x i32> %InMask, <i32 0, i32 12, i32 4294967292, i32 -4, i32 0, i32 12, i32 4294967292, i32 -4, i32 0, i32 12, i32 4294967292, i32 -4, i32 0, i32 12, i32 4294967292, i32 -4>
  %s = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %InVec, <16 x i32> %m)
  ret <16 x float> %s
}

define <2 x double> @bits_test_vpermilvar_pd(<2 x double> %InVec, <2 x i64> %InMask) {
; CHECK-LABEL: @bits_test_vpermilvar_pd(
; CHECK-NEXT:    [[S:%.*]] = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[INVEC:%.*]], <2 x i64> [[INMASK:%.*]])
; CHECK-NEXT:    ret <2 x double> [[S]]
;
  %m = or <2 x i64> %InMask, <i64 0, i64 4294967293>
  %s = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %InVec, <2 x i64> %m)
  ret <2 x double> %s
}

define <4 x double> @bits_test_vpermilvar_pd_256(<4 x double> %InVec, <4 x i64> %InMask) {
; CHECK-LABEL: @bits_test_vpermilvar_pd_256(
; CHECK-NEXT:    [[S:%.*]] = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[INVEC:%.*]], <4 x i64> [[INMASK:%.*]])
; CHECK-NEXT:    ret <4 x double> [[S]]
;
  %m = or <4 x i64> %InMask, <i64 0, i64 1, i64 4294967293, i64 -3>
  %s = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %InVec, <4 x i64> %m)
  ret <4 x double> %s
}

define <8 x double> @bits_test_vpermilvar_pd_512(<8 x double> %InVec, <8 x i64> %InMask) {
; CHECK-LABEL: @bits_test_vpermilvar_pd_512(
; CHECK-NEXT:    [[S:%.*]] = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[INVEC:%.*]], <8 x i64> [[INMASK:%.*]])
; CHECK-NEXT:    ret <8 x double> [[S]]
;
  %m = or <8 x i64> %InMask, <i64 0, i64 1, i64 4294967293, i64 -3, i64 0, i64 1, i64 4294967293, i64 -3>
  %s = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %InVec, <8 x i64> %m)
  ret <8 x double> %s
}

; negative test - vpermilpd uses bit1 not bit0 for the index bit
; (or'ing in 2 sets a demanded bit, so the 'or' must be kept)
define <2 x double> @bits_test_vpermilvar_pd_negative(<2 x double> %InVec, <2 x i64> %InMask) {
; CHECK-LABEL: @bits_test_vpermilvar_pd_negative(
; CHECK-NEXT:    [[M:%.*]] = or <2 x i64> [[INMASK:%.*]], <i64 0, i64 2>
; CHECK-NEXT:    [[S:%.*]] = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[INVEC:%.*]], <2 x i64> [[M]])
; CHECK-NEXT:    ret <2 x double> [[S]]
;
  %m = or <2 x i64> %InMask, <i64 0, i64 2>
  %s = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %InVec, <2 x i64> %m)
  ret <2 x double> %s
}

; Simplify demanded elts
; Only mask elements whose result lane survives the trailing shufflevector are
; demanded; the rest may be replaced with poison/ignored.

define <4 x float> @elts_test_vpermilvar_ps(<4 x float> %a0, i32 %a1) {
; CHECK-LABEL: @elts_test_vpermilvar_ps(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
; CHECK-NEXT:    ret <4 x float> [[TMP1]]
;
  %1 = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %a1, i32 3
  %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %1)
  %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  ret <4 x float> %3
}

define <8 x float> @elts_test_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @elts_test_vpermilvar_ps_256(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 6, i32 poison, i32 7>
; CHECK-NEXT:    ret <8 x float> [[TMP1]]
;
  %1 = shufflevector <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %1)
  %3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 3, i32 undef, i32 5, i32 undef, i32 7>
  ret <8 x float> %3
}

define <16 x float> @elts_test_vpermilvar_ps_512(<16 x float> %a0, <16 x i32> %a1, i32 %a2) {
; CHECK-LABEL: @elts_test_vpermilvar_ps_512(
; CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[A0:%.*]], <16 x i32> [[A1:%.*]])
; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <16 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT:    ret <16 x float> [[TMP2]]
;
  %1 = insertelement <16 x i32> %a1, i32 %a2, i32 0
  %2 = tail call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %a0, <16 x i32> %1)
  %3 = shufflevector <16 x float> %2, <16 x float> undef, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x float> %3
}

define <2 x double> @elts_test_vpermilvar_pd(<2 x double> %a0, i64 %a1) {
; CHECK-LABEL: @elts_test_vpermilvar_pd(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
; CHECK-NEXT:    ret <2 x double> [[TMP1]]
;
  %1 = insertelement <2 x i64> <i64 0, i64 2>, i64 %a1, i32 1
  %2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %1)
  %3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  ret <2 x double> %3
}

define <4 x double> @elts_test_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) {
; CHECK-LABEL: @elts_test_vpermilvar_pd_256(
; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 poison>
; CHECK-NEXT:    ret <4 x double> [[TMP1]]
;
  %1 = shufflevector <4 x i64> <i64 0, i64 2, i64 0, i64 2>, <4 x i64> %a1, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
  %2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %1)
  %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  ret <4 x double> %3
}

define <8 x double> @elts_test_vpermilvar_pd_512(<8 x double> %a0, <8 x i64> %a1, i64 %a2) {
; CHECK-LABEL: @elts_test_vpermilvar_pd_512(
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i64> poison, i64 [[A2:%.*]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[A0:%.*]], <8 x i64> [[TMP1]])
; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <8 x i32> zeroinitializer
; CHECK-NEXT:    ret <8 x double> [[TMP3]]
;
  %1 = insertelement <8 x i64> %a1, i64 %a2, i32 0
  %2 = tail call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %a0, <8 x i64> %1)
  %3 = shufflevector <8 x double> %2, <8 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %3
}

declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>)
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>)
declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)

declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>)
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>)
declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)