; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c

define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fadd <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fadd <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_addsub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_addsub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = and <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = and <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

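; The andnot tests below invert %a0 by XOR'ing it with an all-ones vector; with
; fast-isel that constant is materialized via vxorps + vcmptrueps rather than
; being folded into a single vandnps.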
define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %3, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <8 x i32> %3, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_blend_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_blend_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind {
; CHECK-LABEL: test_mm256_blendv_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind {
; CHECK-LABEL: test_mm256_blendv_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_broadcast_pd(ptr %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_pd:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %ld = load <2 x double>, ptr %a0
  %res = shufflevector <2 x double> %ld, <2 x double> %ld, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_broadcast_ps(ptr %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_ps:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %ld = load <4 x float>, ptr %a0
  %res = shufflevector <4 x float> %ld, <4 x float> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %res
}

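; Scalar broadcasts splat a single loaded element into every lane; the IR
; spells this out as a chain of insertelements that is matched to
; vbroadcastsd/vbroadcastss.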
define <4 x double> @test_mm256_broadcast_sd(ptr %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_sd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_sd:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %ld = load double, ptr %a0
  %ins0 = insertelement <4 x double> undef, double %ld, i32 0
  %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1
  %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2
  %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3
  ret <4 x double> %ins3
}

define <4 x float> @test_mm_broadcast_ss(ptr %a0) nounwind {
; X86-LABEL: test_mm_broadcast_ss:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_broadcast_ss:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
  %ld = load float, ptr %a0
  %ins0 = insertelement <4 x float> undef, float %ld, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3
  ret <4 x float> %ins3
}

define <8 x float> @test_mm256_broadcast_ss(ptr %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_ss:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_ss:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %ld = load float, ptr %a0
  %ins0 = insertelement <8 x float> undef, float %ld, i32 0
  %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1
  %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2
  %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3
  %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4
  %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5
  %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6
  %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7
  ret <8 x float> %ins7
}

define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x double> %a0 to <8 x float>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x double> %a0 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd128_pd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd128_pd256_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %a1 = freeze <2 x double> poison
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %res
}

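; Casts from 256-bit down to 128-bit types only keep the low half of the
; register, so no shuffle is emitted; just a register-class "kill" annotation
; followed by vzeroupper before returning.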
define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd256_pd128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %res
}

define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <8 x float> %a0 to <4 x double>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <8 x float> %a0 to <4 x i64>
  ret <4 x i64> %res
}

define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps128_ps256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_castps128_ps256_freeze(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps128_ps256_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %a1 = freeze <4 x float> poison
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps256_ps128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_castsi128_si256_freeze(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi128_si256_freeze:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %a1 = freeze <2 x i64> poison
  %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x i64> %a0 to <4 x double>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x i64> %a0 to <8 x float>
  ret <8 x float> %res
}

define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_si128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %res
}

define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_ceil_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_ceil_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgepd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmp_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgepd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgeps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgeps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgesd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgess %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

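; Conversion tests: sitofp/fpext plus the cvt/cvtt intrinsics. Narrowing
; conversions from 256-bit sources return an xmm result, so they end with
; vzeroupper.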
define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtepi32_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res = sitofp <4 x i32> %arg0 to <4 x double>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtepi32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = sitofp <8 x i32> %arg0 to <8 x float>
  ret <8 x float> %res
}

define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtpd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtpd2dq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
  %res = bitcast <4 x i32> %cvt to <2 x i64>
  ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtpd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtps_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtps2dq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtps_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fpext <4 x float> %a0 to <4 x double>
  ret <4 x double> %res
}

define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvttpd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
  %res = bitcast <4 x i32> %cvt to <2 x i64>
  ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvttps_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_div_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fdiv <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_div_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fdiv <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_dp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

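; Element extracts from the upper 128 bits need a vextractf128 first, since the
; AVX1 extract instructions only address an xmm register; the 32-bit target
; additionally splits the i64 extract into two 32-bit vextractps.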
define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrb $15, %xmm0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %ext = extractelement <32 x i8> %arg0, i32 31
  %res = zext i8 %ext to i32
  ret i32 %res
}

define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrw $3, %xmm0, %eax
; CHECK-NEXT:    movzwl %ax, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %ext = extractelement <16 x i16> %arg0, i32 11
  %res = zext i16 %ext to i32
  ret i32 %res
}

define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vextractps $1, %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = extractelement <8 x i32> %arg0, i32 5
  ret i32 %res
}

define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
; X86-LABEL: test_mm256_extract_epi64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vextractps $2, %xmm0, %eax
; X86-NEXT:    vextractps $3, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_extract_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = extractelement <4 x i64> %a0, i32 3
  ret i64 %res
}

define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x double> %res
}

define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x float> %res
}

define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_floor_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1)
  ret <4 x double> %res
}

define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_floor_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1)
  ret <8 x float> %res
}

define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_hadd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_hadd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_hsub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_hsub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

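; Element inserts mirror the extracts: inserts into the low 128 bits use
; vpinsr* plus a blend, while inserts into the upper half go through
; vextractf128/vinsertf128.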
define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi8:
; X64:       # %bb.0:
; X64-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi16:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vpinsrw $6, %edi, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = insertelement <4 x i64> %a0, i64 %a1, i32 3
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

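; Load tests: aligned loads (align 32) select vmovaps, unaligned loads
; (align 1) select vmovups, and lddqu keeps its dedicated intrinsic.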
define <4 x i64> @test_mm256_lddqu_si256(ptr %a0) nounwind {
; X86-LABEL: test_mm256_lddqu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vlddqu (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_lddqu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vlddqu (%rdi), %ymm0
; X64-NEXT:    retq
  %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr %a0)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readnone

define <4 x double> @test_mm256_load_pd(ptr %a0) nounwind {
; X86-LABEL: test_mm256_load_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x double>, ptr %a0, align 32
  ret <4 x double> %res
}

define <8 x float> @test_mm256_load_ps(ptr %a0) nounwind {
; X86-LABEL: test_mm256_load_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <8 x float>, ptr %a0, align 32
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_load_si256(ptr %a0) nounwind {
; X86-LABEL: test_mm256_load_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x i64>, ptr %a0, align 32
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_loadu_pd(ptr %a0) nounwind {
; X86-LABEL: test_mm256_loadu_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x double>, ptr %a0, align 1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_loadu_ps(ptr %a0) nounwind {
; X86-LABEL: test_mm256_loadu_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <8 x float>, ptr %a0, align 1
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_loadu_si256(ptr %a0) nounwind {
; X86-LABEL: test_mm256_loadu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x i64>, ptr %a0, align 1
  ret <4 x i64> %res
}

define <8 x float> @test_mm256_loadu2_m128(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %hi4 = load <4 x float>, ptr %a0, align 1
  %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %lo4 = load <4 x float>, ptr %a1, align 1
  %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_loadu2_m128d(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128d:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128d:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %hi2 = load <2 x double>, ptr %a0, align 1
  %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %lo2 = load <2 x double>, ptr %a1, align 1
  %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_loadu2_m128i(ptr %a0, ptr %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128i:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128i:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %hi2 = load <2 x i64>, ptr %a0, align 1
  %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %lo2 = load <2 x i64>, ptr %a1, align 1
  %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

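; Masked loads and stores: the sign bit of each mask element selects whether
; that lane is loaded/stored, mapping directly onto vmaskmovps/vmaskmovpd.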
define <2 x double> @test_mm_maskload_pd(ptr %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <2 x double> @llvm.x86.avx.maskload.pd(ptr %a0, <2 x i64> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.maskload.pd(ptr, <2 x i64>) nounwind readnone

define <4 x double> @test_mm256_maskload_pd(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind readnone

define <4 x float> @test_mm_maskload_ps(ptr %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %arg1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readnone

define <8 x float> @test_mm256_maskload_ps(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_pd(ptr %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %a1, <2 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind readnone

define void @test_mm256_maskstore_pd(ptr %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %a1, <4 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwind readnone

define void @test_mm_maskstore_ps(ptr %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %arg1, <4 x float> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind readnone

define void @test_mm256_maskstore_ps(ptr %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %arg1, <8 x float> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_max_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_max_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_min_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vminpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_min_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_movedup_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_movehdup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  ret <8 x float> %res
}

define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_moveldup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x float> %res
}

define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovmskpd %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone

define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovmskps %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_mul_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fmul <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_mul_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fmul <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = or <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = or <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

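; The permute-with-immediate tests are written as constant shufflevectors,
; which the backend is free to match to vshufpd/vshufps rather than
; vpermilpd/vpermilps.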
define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm_permute_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %res
}

define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_permute_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %res
}

define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %res
}

define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
; CHECK-LABEL: test2_mm_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,1,2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_permute2f128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

; PR26667
define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_permute2f128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_permute2f128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x i64> %a0 to <8 x i32>
  %2 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %2, <8 x i32> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone

define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm_permutevar_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone

define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_permutevar_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone

define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm_permutevar_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone

define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_permutevar_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone

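; vrcpps and vrsqrtps are the fast approximate reciprocal and reciprocal
; square-root instructions; the round tests reuse the same round.pd/ps.256
; intrinsics as ceil/floor, just with a different immediate.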
define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_rcp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrcpps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_round_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $4, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
  ret <4 x double> %res
}

define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_round_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $4, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
  ret <8 x float> %res
}

define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_rsqrt_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrsqrtps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone

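; The _mm256_set_* intrinsics list their arguments from the highest element
; down, so the IR inserts them in reverse: %a31 lands in element 0. On X64 the
; first six arguments arrive in registers; the rest are loaded from the stack.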
define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X86-LABEL: test_mm256_set_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $10, %r9d, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $11, %r8d, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $14, %esi, %xmm0, %xmm0
; X64-NEXT:    vpinsrb $15, %edi, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <32 x i8> undef,  i8 %a31, i32 0
  %res1  = insertelement <32 x i8> %res0,  i8 %a30, i32 1
  %res2  = insertelement <32 x i8> %res1,  i8 %a29, i32 2
  %res3  = insertelement <32 x i8> %res2,  i8 %a28, i32 3
  %res4  = insertelement <32 x i8> %res3,  i8 %a27, i32 4
  %res5  = insertelement <32 x i8> %res4,  i8 %a26, i32 5
  %res6  = insertelement <32 x i8> %res5,  i8 %a25, i32 6
  %res7  = insertelement <32 x i8> %res6,  i8 %a24, i32 7
  %res8  = insertelement <32 x i8> %res7,  i8 %a23, i32 8
  %res9  = insertelement <32 x i8> %res8,  i8 %a22, i32 9
  %res10 = insertelement <32 x i8> %res9,  i8 %a21, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

movzbl {{[0-9]+}}(%rsp), %eax 1529; X64-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 1530; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1531; X64-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 1532; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1533; X64-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 1534; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1535; X64-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 1536; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1537; X64-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 1538; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1539; X64-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 1540; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1541; X64-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 1542; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1543; X64-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 1544; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax 1545; X64-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 1546; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1547; X64-NEXT: retq 1548 %res0 = insertelement <32 x i8> undef, i8 %a31, i32 0 1549 %res1 = insertelement <32 x i8> %res0, i8 %a30, i32 1 1550 %res2 = insertelement <32 x i8> %res1, i8 %a29, i32 2 1551 %res3 = insertelement <32 x i8> %res2, i8 %a28, i32 3 1552 %res4 = insertelement <32 x i8> %res3, i8 %a27, i32 4 1553 %res5 = insertelement <32 x i8> %res4, i8 %a26, i32 5 1554 %res6 = insertelement <32 x i8> %res5, i8 %a25, i32 6 1555 %res7 = insertelement <32 x i8> %res6, i8 %a24, i32 7 1556 %res8 = insertelement <32 x i8> %res7, i8 %a23, i32 8 1557 %res9 = insertelement <32 x i8> %res8, i8 %a22, i32 9 1558 %res10 = insertelement <32 x i8> %res9, i8 %a21, i32 10 1559 %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11 1560 %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12 1561 %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13 1562 %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14 1563 %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15 1564 %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16 1565 %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17 1566 %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18 1567 %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19 1568 %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20 1569 %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21 1570 %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22 1571 %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23 1572 %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24 1573 %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25 1574 %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26 1575 %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27 1576 %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28 1577 %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29 1578 %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30 1579 %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31 1580 %res = bitcast <32 x i8> %res31 to <4 x i64> 1581 ret <4 x i64> %res 1582} 1583 1584define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { 1585; X86-LABEL: test_mm256_set_epi16: 1586; X86: # %bb.0: 1587; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1588; X86-NEXT: vmovd %eax, %xmm0 1589; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1590; X86-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 1591; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1592; X86-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 1593; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1594; X86-NEXT: vpinsrw $3, %eax, %xmm0, 
%xmm0 1595; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1596; X86-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0 1597; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1598; X86-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 1599; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1600; X86-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 1601; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1602; X86-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 1603; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1604; X86-NEXT: vmovd %eax, %xmm1 1605; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1606; X86-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 1607; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1608; X86-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 1609; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1610; X86-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 1611; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1612; X86-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 1613; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1614; X86-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 1615; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1616; X86-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 1617; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax 1618; X86-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 1619; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1620; X86-NEXT: retl 1621; 1622; X64-LABEL: test_mm256_set_epi16: 1623; X64: # %bb.0: 1624; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1625; X64-NEXT: vmovd %eax, %xmm0 1626; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1627; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 1628; X64-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0 1629; X64-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0 1630; X64-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 1631; X64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 1632; X64-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0 1633; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 1634; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1635; X64-NEXT: vmovd %eax, %xmm1 1636; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1637; X64-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 1638; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1639; X64-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 1640; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1641; X64-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 1642; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1643; X64-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 1644; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1645; X64-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 1646; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1647; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 1648; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax 1649; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 1650; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1651; X64-NEXT: retq 1652 %res0 = insertelement <16 x i16> undef, i16 %a15, i32 0 1653 %res1 = insertelement <16 x i16> %res0, i16 %a14, i32 1 1654 %res2 = insertelement <16 x i16> %res1, i16 %a13, i32 2 1655 %res3 = insertelement <16 x i16> %res2, i16 %a12, i32 3 1656 %res4 = insertelement <16 x i16> %res3, i16 %a11, i32 4 1657 %res5 = insertelement <16 x i16> %res4, i16 %a10, i32 5 1658 %res6 = insertelement <16 x i16> %res5, i16 %a9 , i32 6 1659 %res7 = insertelement <16 x i16> %res6, i16 %a8 , i32 7 1660 %res8 = insertelement <16 x i16> %res7, i16 %a7 , i32 8 1661 %res9 = insertelement <16 x i16> %res8, i16 %a6 , i32 9 1662 %res10 = insertelement <16 x i16> %res9, i16 %a5 , i32 10 1663 %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11 1664 %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12 1665 %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13 1666 %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14 1667 %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15 1668 %res = bitcast <16 x i16> %res15 to <4 x i64> 1669 
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X86-LABEL: test_mm256_set_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %ecx, %xmm0
; X64-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
; X64-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; X64-NEXT:    vpinsrd $2, %r9d, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $3, %r8d, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X86-LABEL: test_mm256_set_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vmovq %rsi, %xmm1
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %rcx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
  ret <4 x i64> %res3
}

define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128d:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x double> %a0 to <4 x float>
  %arg1 = bitcast <2 x double> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x double>
  ret <4 x double> %bc
}

define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128i:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X86-LABEL: test_mm256_set_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a3, i32 0
  %res1 = insertelement <4 x double> %res0, double %a2, i32 1
  %res2 = insertelement <4 x double> %res1, double %a1, i32 2
  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X86-LABEL: test_mm256_set_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_ps:
; X64:       # %bb.0:
; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a7, i32 0
  %res1 = insertelement <8 x float> %res0, float %a6, i32 1
  %res2 = insertelement <8 x float> %res1, float %a5, i32 2
  %res3 = insertelement <8 x float> %res2, float %a4, i32 3
  %res4 = insertelement <8 x float> %res3, float %a3, i32 4
  %res5 = insertelement <8 x float> %res4, float %a2, i32 5
  %res6 = insertelement <8 x float> %res5, float %a1, i32 6
  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
  ret <8 x float> %res7
}

define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi8:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <32 x i8> undef, i8 %a0, i32 0
  %res1 = insertelement <32 x i8> %res0, i8 %a0, i32 1
  %res2 = insertelement <32 x i8> %res1, i8 %a0, i32 2
  %res3 = insertelement <32 x i8> %res2, i8 %a0, i32 3
  %res4 = insertelement <32 x i8> %res3, i8 %a0, i32 4
  %res5 = insertelement <32 x i8> %res4, i8 %a0, i32 5
  %res6 = insertelement <32 x i8> %res5, i8 %a0, i32 6
  %res7 = insertelement <32 x i8> %res6, i8 %a0, i32 7
  %res8 = insertelement <32 x i8> %res7, i8 %a0, i32 8
  %res9 = insertelement <32 x i8> %res8, i8 %a0, i32 9
  %res10 = insertelement <32 x i8> %res9, i8 %a0, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi16:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <16 x i16> undef, i16 %a0, i32 0
  %res1 = insertelement <16 x i16> %res0, i16 %a0, i32 1
  %res2 = insertelement <16 x i16> %res1, i16 %a0, i32 2
  %res3 = insertelement <16 x i16> %res2, i16 %a0, i32 3
  %res4 = insertelement <16 x i16> %res3, i16 %a0, i32 4
  %res5 = insertelement <16 x i16> %res4, i16 %a0, i32 5
  %res6 = insertelement <16 x i16> %res5, i16 %a0, i32 6
  %res7 = insertelement <16 x i16> %res6, i16 %a0, i32 7
  %res8 = insertelement <16 x i16> %res7, i16 %a0, i32 8
  %res9 = insertelement <16 x i16> %res8, i16 %a0, i32 9
  %res10 = insertelement <16 x i16> %res9, i16 %a0, i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
  ret <4 x i64> %res3
}

define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
; X86-LABEL: test_mm256_set1_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a0, i32 0
  %res1 = insertelement <4 x double> %res0, double %a0, i32 1
  %res2 = insertelement <4 x double> %res1, double %a0, i32 2
  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
; X86-LABEL: test_mm256_set1_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_ps:
; X64:       # %bb.0:
; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a0, i32 0
  %res1 = insertelement <8 x float> %res0, float %a0, i32 1
  %res2 = insertelement <8 x float> %res1, float %a0, i32 2
  %res3 = insertelement <8 x float> %res2, float %a0, i32 3
  %res4 = insertelement <8 x float> %res3, float %a0, i32 4
  %res5 = insertelement <8 x float> %res4, float %a0, i32 5
  %res6 = insertelement <8 x float> %res5, float %a0, i32 6
  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
  ret <8 x float> %res7
}

define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X86-LABEL: test_mm256_setr_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrb $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $4, %r8d, %xmm1, %xmm1
; X64-NEXT:    vpinsrb $5, %r9d, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <32 x i8> undef, i8 %a0 , i32 0
  %res1 = insertelement <32 x i8> %res0, i8 %a1 , i32 1
  %res2 = insertelement <32 x i8> %res1, i8 %a2 , i32 2
  %res3 = insertelement <32 x i8> %res2, i8 %a3 , i32 3
  %res4 = insertelement <32 x i8> %res3, i8 %a4 , i32 4
  %res5 = insertelement <32 x i8> %res4, i8 %a5 , i32 5
  %res6 = insertelement <32 x i8> %res5, i8 %a6 , i32 6
  %res7 = insertelement <32 x i8> %res6, i8 %a7 , i32 7
  %res8 = insertelement <32 x i8> %res7, i8 %a8 , i32 8
  %res9 = insertelement <32 x i8> %res8, i8 %a9 , i32 9
  %res10 = insertelement <32 x i8> %res9, i8 %a10, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X86-LABEL: test_mm256_setr_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi16:
; X64:       # %bb.0:
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrw $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $4, %r8d, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $5, %r9d, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <16 x i16> undef, i16 %a0 , i32 0
  %res1 = insertelement <16 x i16> %res0, i16 %a1 , i32 1
  %res2 = insertelement <16 x i16> %res1, i16 %a2 , i32 2
  %res3 = insertelement <16 x i16> %res2, i16 %a3 , i32 3
  %res4 = insertelement <16 x i16> %res3, i16 %a4 , i32 4
  %res5 = insertelement <16 x i16> %res4, i16 %a5 , i32 5
  %res6 = insertelement <16 x i16> %res5, i16 %a6 , i32 6
  %res7 = insertelement <16 x i16> %res6, i16 %a7 , i32 7
  %res8 = insertelement <16 x i16> %res7, i16 %a8 , i32 8
  %res9 = insertelement <16 x i16> %res8, i16 %a9 , i32 9
  %res10 = insertelement <16 x i16> %res9, i16 %a10, i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X86-LABEL: test_mm256_setr_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %r8d, %xmm0
; X64-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X86-LABEL: test_mm256_setr_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rcx, %xmm0
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovq %rsi, %xmm1
; X64-NEXT:    vmovq %rdi, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3
  ret <4 x i64> %res3
}

define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128d:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x double> %a0 to <4 x float>
  %arg1 = bitcast <2 x double> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x double>
  ret <4 x double> %bc
}

define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_setr_m128i:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X86-LABEL: test_mm256_setr_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a0, i32 0
  %res1 = insertelement <4 x double> %res0, double %a1, i32 1
  %res2 = insertelement <4 x double> %res1, double %a2, i32 2
  %res3 = insertelement <4 x double> %res2, double %a3, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X86-LABEL: test_mm256_setr_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_ps:
; X64:       # %bb.0:
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a0, i32 0
  %res1 = insertelement <8 x float> %res0, float %a1, i32 1
  %res2 = insertelement <8 x float> %res1, float %a2, i32 2
  %res3 = insertelement <8 x float> %res2, float %a3, i32 3
  %res4 = insertelement <8 x float> %res3, float %a4, i32 4
  %res5 = insertelement <8 x float> %res4, float %a5, i32 5
  %res6 = insertelement <8 x float> %res5, float %a6, i32 6
  %res7 = insertelement <8 x float> %res6, float %a7, i32 7
  ret <8 x float> %res7
}

define <4 x double> @test_mm256_setzero_pd() nounwind {
; CHECK-LABEL: test_mm256_setzero_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x double> zeroinitializer
}

define <8 x float> @test_mm256_setzero_ps() nounwind {
; CHECK-LABEL: test_mm256_setzero_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <8 x float> zeroinitializer
}

define <4 x i64> @test_mm256_setzero_si256() nounwind {
; CHECK-LABEL: test_mm256_setzero_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x i64> zeroinitializer
}

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_shuffle_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_shuffle_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_sqrt_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtpd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2
  ret <4 x double> %0
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1

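; NOTE: Unlike most tests in this file, the sqrt tests call the generic
; @llvm.sqrt.* intrinsics rather than x86-specific ones; fast-isel lowers
; these directly to vsqrtpd/vsqrtps.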
define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_sqrt_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2
  ret <8 x float> %0
}

declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1

define void @test_mm256_store_pd(ptr %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_store_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_store_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x double> %a1, ptr %a0, align 32
  ret void
}

define void @test_mm256_store_ps(ptr %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_store_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_store_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <8 x float> %a1, ptr %a0, align 32
  ret void
}

define void @test_mm256_store_si256(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_store_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_store_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, ptr %a0, align 32
  ret void
}

define void @test_mm256_storeu_pd(ptr %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x double> %a1, ptr %a0, align 1
  ret void
}

define void @test_mm256_storeu_ps(ptr %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <8 x float> %a1, ptr %a0, align 1
  ret void
}

define void @test_mm256_storeu_si256(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, ptr %a0, align 1
  ret void
}

define void @test_mm256_storeu2_m128(ptr %a0, ptr %a1, <8 x float> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x float> %lo, ptr %a0, align 1
  %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %hi, ptr %a1, align 1
  ret void
}

define void @test_mm256_storeu2_m128d(ptr %a0, ptr %a1, <4 x double> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128d:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128d:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1>
  store <2 x double> %lo, ptr %a0, align 1
  %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3>
  store <2 x double> %hi, ptr %a1, align 1
  ret void
}

define void @test_mm256_storeu2_m128i(ptr %a0, ptr %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128i:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128i:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1>
  store <2 x i64> %lo, ptr %a0, align 1
  %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3>
  store <2 x i64> %hi, ptr %a1, align 1
  ret void
}

define void @test_mm256_stream_pd(ptr %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_stream_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x double> %a1, ptr %a0, align 32, !nontemporal !0
  ret void
}

define void @test_mm256_stream_ps(ptr %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_stream_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <8 x float> %a1, ptr %a0, align 32, !nontemporal !0
  ret void
}

define void @test_mm256_stream_si256(ptr %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_stream_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, ptr %a0, align 32, !nontemporal !0
  ret void
}

define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fsub <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fsub <8 x float> %a0, %a1
  ret <8 x float> %res
}

define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x double> @test_mm256_undefined_pd() nounwind {
; CHECK-LABEL: test_mm256_undefined_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x double> undef
}

define <8 x float> @test_mm256_undefined_ps() nounwind {
; CHECK-LABEL: test_mm256_undefined_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <8 x float> undef
}

define <4 x i64> @test_mm256_undefined_si256() nounwind {
; CHECK-LABEL: test_mm256_undefined_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x i64> undef
}

define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = xor <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = xor <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define void @test_mm256_zeroall() nounwind {
; CHECK-LABEL: test_mm256_zeroall:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vzeroall
; CHECK-NEXT:    ret{{[l|q]}}
  call void @llvm.x86.avx.vzeroall()
  ret void
}
declare void @llvm.x86.avx.vzeroall() nounwind readnone

define void @test_mm256_zeroupper() nounwind {
; CHECK-LABEL: test_mm256_zeroupper:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  call void @llvm.x86.avx.vzeroupper()
  ret void
}
declare void @llvm.x86.avx.vzeroupper() nounwind readnone

define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextpd128_pd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextps128_ps256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %res
}

!0 = !{i32 1}