; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X86-SSE
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX1
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X86-AVX,AVX512
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,X64-SSE
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse41-builtins.c

define <2 x i64> @test_mm_blend_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_blend_epi16:
; SSE:       # %bb.0:
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_epi16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_blend_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %res
}

define <4 x float> @test_mm_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_blend_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x float> %res
}

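; The SSE4.1 blendv instructions take their selection mask implicitly in xmm0,
; which forces the extra register copies in the SSE lowerings below; the VEX
; forms take the mask as an explicit operand.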
define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; SSE-LABEL: test_mm_blendv_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
  %call = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %arg0, <16 x i8> %arg1, <16 x i8> %arg2)
  %res = bitcast <16 x i8> %call to <2 x i64>
  ret <2 x i64> %res
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone

define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE-LABEL: test_mm_blendv_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone

define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE-LABEL: test_mm_blendv_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone

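; The round.* intrinsics take the rounding mode as an immediate:
; 2 = _MM_FROUND_CEIL, 1 = _MM_FROUND_FLOOR, 4 = _MM_FROUND_CUR_DIRECTION.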
define <2 x double> @test_mm_ceil_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_ceil_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundpd $2, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundpd $2, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone

define <4 x float> @test_mm_ceil_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_ceil_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    roundps $2, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundps $2, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone

define <2 x double> @test_mm_ceil_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_ceil_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundsd $2, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundsd $2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone

define <4 x float> @test_mm_ceil_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_ceil_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    roundss $2, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundss $2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone

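; With AVX512VL+DQ the integer compare produces a k-register mask, which is
; expanded back to a vector with vpmovm2q.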
define <2 x i64> @test_mm_cmpeq_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_cmpeq_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: test_mm_cmpeq_epi64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: test_mm_cmpeq_epi64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <2 x i64> %a0, %a1
  %res = sext <2 x i1> %cmp to <2 x i64>
  ret <2 x i64> %res
}

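; Sign-extension tests: a shuffle of the low elements followed by a sext
; should select the pmovsx* instructions.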
define <2 x i64> @test_mm_cvtepi8_epi16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sext = sext <8 x i8> %ext0 to <8 x i16>
  %res = bitcast <8 x i16> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepi8_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = sext <4 x i8> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepi8_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbq %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i8> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepi16_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi16_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxwd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi16_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = sext <4 x i16> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepi16_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi16_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxwq %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi16_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i16> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepi32_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi32_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxdq %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi32_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i32> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

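; Zero-extension tests: these select pmovzx*; the {{.*#+}} patterns match the
; shuffle decoding comments that the asm printer appends to each instruction.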
define <2 x i64> @test_mm_cvtepu8_epi16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sext = zext <8 x i8> %ext0 to <8 x i16>
  %res = bitcast <8 x i16> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = zext <4 x i8> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i8> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu16_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu16_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = zext <4 x i16> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu16_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu16_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i16> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu32_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu32_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i32> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}

define <2 x double> @test_mm_dp_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_dp_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    dppd $7, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_dp_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @test_mm_dp_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_dp_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    dpps $7, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_dp_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone

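; Element extracts. Note that i386 has no pextrq, so the 64-bit extract is
; split into two 32-bit extractps into eax:edx.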
define i32 @test_mm_extract_epi8(<2 x i64> %a0) {
; SSE-LABEL: test_mm_extract_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrb $1, %xmm0, %eax
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrb $1, %xmm0, %eax
; AVX-NEXT:    movzbl %al, %eax
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = extractelement <16 x i8> %arg0, i32 1
  %res = zext i8 %ext to i32
  ret i32 %res
}

define i32 @test_mm_extract_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_extract_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    extractps $1, %xmm0, %eax
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractps $1, %xmm0, %eax
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = extractelement <4 x i32> %arg0, i32 1
  ret i32 %ext
}

define i64 @test_mm_extract_epi64(<2 x i64> %a0) {
; X86-SSE-LABEL: test_mm_extract_epi64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    extractps $2, %xmm0, %eax
; X86-SSE-NEXT:    extractps $3, %xmm0, %edx
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_extract_epi64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vextractps $2, %xmm0, %eax
; X86-AVX-NEXT:    vextractps $3, %xmm0, %edx
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_extract_epi64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pextrq $1, %xmm0, %rax
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_extract_epi64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpextrq $1, %xmm0, %rax
; X64-AVX-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = extractelement <2 x i64> %a0, i32 1
  ret i64 %ext
}

define i32 @test_mm_extract_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_extract_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    ret{{[l|q]}}
  %ext = extractelement <4 x float> %a0, i32 1
  %bc = bitcast float %ext to i32
  ret i32 %bc
}

define <2 x double> @test_mm_floor_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_floor_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundpd $1, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundpd $1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 1)
  ret <2 x double> %res
}

define <4 x float> @test_mm_floor_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_floor_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    roundps $1, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundps $1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 1)
  ret <4 x float> %res
}

define <2 x double> @test_mm_floor_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_floor_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundsd $1, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundsd $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 1)
  ret <2 x double> %res
}

define <4 x float> @test_mm_floor_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_floor_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    roundss $1, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundss $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 1)
  ret <4 x float> %res
}

define <2 x i64> @test_mm_insert_epi8(<2 x i64> %a0, i8 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    pinsrb $1, %eax, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_insert_epi8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_insert_epi8:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movzbl %dil, %eax
; X64-SSE-NEXT:    pinsrb $1, %eax, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_insert_epi8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res = insertelement <16 x i8> %arg0, i8 %a1, i32 1
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_insert_epi32(<2 x i64> %a0, i32 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_insert_epi32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_insert_epi32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pinsrd $1, %edi, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_insert_epi32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res = insertelement <4 x i32> %arg0, i32 %a1, i32 1
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_insert_epi64(<2 x i64> %a0, i64 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pinsrd $2, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    pinsrd $3, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_insert_epi64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_insert_epi64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pinsrq $1, %rdi, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_insert_epi64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %res = insertelement <2 x i64> %a0, i64 %a1, i32 1
  ret <2 x i64> %res
}

define <4 x float> @test_mm_insert_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_insert_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_insert_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 4)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone

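; The min/max tests use the generic llvm.smin/smax/umin/umax intrinsics
; rather than an icmp+select pattern.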
define <2 x i64> @test_mm_max_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxsb %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %sel = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %arg0, <16 x i8> %arg1)
  %bc = bitcast <16 x i8> %sel to <2 x i64>
  ret <2 x i64> %bc
}
declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>)

define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %sel = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)

define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epu16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxuw %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epu16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %sel = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <8 x i16> %sel to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>)

define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epu32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxud %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epu32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %sel = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)

define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    pminsb %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %sel = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %arg0, <16 x i8> %arg1)
  %bc = bitcast <16 x i8> %sel to <2 x i64>
  ret <2 x i64> %bc
}
declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>)

define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pminsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %sel = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)

define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epu16:
; SSE:       # %bb.0:
; SSE-NEXT:    pminuw %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epu16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %sel = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <8 x i16> %sel to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>)

define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epu32:
; SSE:       # %bb.0:
; SSE-NEXT:    pminud %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epu32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %sel = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)

define <2 x i64> @test_mm_minpos_epu16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_minpos_epu16:
; SSE:       # %bb.0:
; SSE-NEXT:    phminposuw %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_minpos_epu16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphminposuw %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %arg0)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone

define <2 x i64> @test_mm_mpsadbw_epu8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mpsadbw_epu8:
; SSE:       # %bb.0:
; SSE-NEXT:    mpsadbw $1, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_mpsadbw_epu8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmpsadbw $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %arg0, <16 x i8> %arg1, i8 1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone

define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mul_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuldq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: test_mm_mul_epi32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: test_mm_mul_epi32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
; AVX512-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    ret{{[l|q]}}
  %A = shl <2 x i64> %a0, <i64 32, i64 32>
  %A1 = ashr exact <2 x i64> %A, <i64 32, i64 32>
  %B = shl <2 x i64> %a1, <i64 32, i64 32>
  %B1 = ashr exact <2 x i64> %B, <i64 32, i64 32>
  %res = mul nsw <2 x i64> %A1, %B1
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_mullo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_mullo_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulld %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_mullo_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = mul <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_packus_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_packus_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    packusdw %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_packus_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i16> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone

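; _mm_round_* with immediate 4 (_MM_FROUND_CUR_DIRECTION) rounds according to
; the current rounding mode in MXCSR.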
define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_round_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundpd $4, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundpd $4, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4)
  ret <2 x double> %res
}

define <4 x float> @test_mm_round_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_round_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    roundps $4, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundps $4, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4)
  ret <4 x float> %res
}

define <2 x double> @test_mm_round_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_round_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundsd $4, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundsd $4, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 4)
  ret <2 x double> %res
}

define <4 x float> @test_mm_round_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_round_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    roundss $4, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_round_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundss $4, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 4)
  ret <4 x float> %res
}

define <2 x i64> @test_mm_stream_load_si128(ptr %a0) {
; X86-SSE-LABEL: test_mm_stream_load_si128:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movntdqa (%eax), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_stream_load_si128:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovntdqa (%eax), %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_stream_load_si128:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movntdqa (%rdi), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_stream_load_si128:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovntdqa (%rdi), %xmm0
; X64-AVX-NEXT:    retq
  %res = call <2 x i64> @llvm.x86.sse41.movntdqa(ptr %a0)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.movntdqa(ptr) nounwind readnone

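; The ptest-based tests map onto EFLAGS: setb reads CF (testc), sete reads ZF
; (testz), and seta requires both CF and ZF to be clear (testnzc).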
define i32 @test_mm_test_all_ones(<2 x i64> %a0) {
; SSE-LABEL: test_mm_test_all_ones:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    setb %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_test_all_ones:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> <i64 -1, i64 -1>)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone

define i32 @test_mm_test_all_zeros(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_test_all_zeros:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    sete %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_test_all_zeros:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    sete %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone

define i32 @test_mm_test_mix_ones_zeros(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_test_mix_ones_zeros:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    seta %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_test_mix_ones_zeros:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    seta %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone

define i32 @test_mm_testc_si128(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_testc_si128:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    setb %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_testc_si128:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    setb %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}

define i32 @test_mm_testnzc_si128(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_testnzc_si128:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    seta %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_testnzc_si128:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    seta %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}

define i32 @test_mm_testz_si128(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_testz_si128:
; SSE:       # %bb.0:
; SSE-NEXT:    xorl %eax, %eax
; SSE-NEXT:    ptest %xmm1, %xmm0
; SSE-NEXT:    sete %al
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_testz_si128:
; AVX:       # %bb.0:
; AVX-NEXT:    xorl %eax, %eax
; AVX-NEXT:    vptest %xmm1, %xmm0
; AVX-NEXT:    sete %al
; AVX-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
  ret i32 %res
}