; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX

; Tests that a horizontal-op (hadd/hsub) feeding an unpacklo/unpackhi-style
; shufflevector is folded into a single horizontal-op on the relevant operand
; pair plus one lane shuffle.

define <4 x float> @test_unpacklo_hadd_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSE-LABEL: test_unpacklo_hadd_v4f32:
; SSE:       ## %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpacklo_hadd_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %1) #4
  %6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %3) #4
  %7 = shufflevector <4 x float> %5, <4 x float> %6, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %7
}

define <4 x float> @test_unpackhi_hadd_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSE-LABEL: test_unpackhi_hadd_v4f32:
; SSE:       ## %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpackhi_hadd_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vhaddps %xmm3, %xmm1, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %1) #4
  %6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %3) #4
  %7 = shufflevector <4 x float> %5, <4 x float> %6, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %7
}

define <4 x float> @test_unpacklo_hsub_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSE-LABEL: test_unpacklo_hsub_v4f32:
; SSE:       ## %bb.0:
; SSE-NEXT:    hsubps %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpacklo_hsub_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vhsubps %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %0, <4 x float> %1) #4
  %6 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %2, <4 x float> %3) #4
  %7 = shufflevector <4 x float> %5, <4 x float> %6, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %7
}

define <4 x float> @test_unpackhi_hsub_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
; SSE-LABEL: test_unpackhi_hsub_v4f32:
; SSE:       ## %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    hsubps %xmm3, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpackhi_hsub_v4f32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vhsubps %xmm3, %xmm1, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %0, <4 x float> %1) #4
  %6 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %2, <4 x float> %3) #4
  %7 = shufflevector <4 x float> %5, <4 x float> %6, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %7
}

define <4 x i32> @test_unpacklo_hadd_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSE-LABEL: test_unpacklo_hadd_v4i32:
; SSE:       ## %bb.0:
; SSE-NEXT:    phaddd %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpacklo_hadd_v4i32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %0, <4 x i32> %1) #5
  %6 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %2, <4 x i32> %3) #5
  %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x i32> %7
}

define <4 x i32> @test_unpackhi_hadd_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSE-LABEL: test_unpackhi_hadd_v4i32:
; SSE:       ## %bb.0:
; SSE-NEXT:    phaddd %xmm3, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpackhi_hadd_v4i32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphaddd %xmm3, %xmm1, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %0, <4 x i32> %1) #5
  %6 = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %2, <4 x i32> %3) #5
  %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x i32> %7
}

define <4 x i32> @test_unpacklo_hsub_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSE-LABEL: test_unpacklo_hsub_v4i32:
; SSE:       ## %bb.0:
; SSE-NEXT:    phsubd %xmm2, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpacklo_hsub_v4i32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %0, <4 x i32> %1) #5
  %6 = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %2, <4 x i32> %3) #5
  %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x i32> %7
}

define <4 x i32> @test_unpackhi_hsub_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
; SSE-LABEL: test_unpackhi_hsub_v4i32:
; SSE:       ## %bb.0:
; SSE-NEXT:    phsubd %xmm3, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpackhi_hsub_v4i32:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphsubd %xmm3, %xmm1, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT:    ret{{[l|q]}}
  %5 = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %0, <4 x i32> %1) #5
  %6 = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %2, <4 x i32> %3) #5
  %7 = shufflevector <4 x i32> %5, <4 x i32> %6, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x i32> %7
}

;
; Special Case
;

; Unary case: both hadd operands are the same value, so the unpacklo shuffle
; only needs lanes from a single hadd result.
define <4 x float> @test_unpacklo_hadd_v4f32_unary(<4 x float> %0) {
; SSE-LABEL: test_unpacklo_hadd_v4f32_unary:
; SSE:       ## %bb.0:
; SSE-NEXT:    haddps %xmm0, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_unpacklo_hadd_v4f32_unary:
; AVX:       ## %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT:    ret{{[l|q]}}
  %2 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) #4
  %3 = shufflevector <4 x float> %2, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %3
}

; Chained phaddw where the second op reuses both the first result and the
; original input — the intermediate result must not be clobbered.
define <8 x i16> @PR51974(<8 x i16> %a0) {
; SSE-LABEL: PR51974:
; SSE:       ## %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    phaddw %xmm0, %xmm1
; SSE-NEXT:    phaddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: PR51974:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
; AVX-NEXT:    vphaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %r0 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a0)
  %r1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %r0, <8 x i16> %a0)
  ret <8 x i16> %r1
}

; Three-deep phaddw chain; %r1 is used by both later ops, so it must survive
; the second phaddw.
define <8 x i16> @PR52040(<8 x i16> %a0) {
; SSE-LABEL: PR52040:
; SSE:       ## %bb.0:
; SSE-NEXT:    phaddw %xmm0, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    phaddw %xmm0, %xmm1
; SSE-NEXT:    phaddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: PR52040:
; AVX:       ## %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
; AVX-NEXT:    vphaddw %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %r1 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a0)
  %r2 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %r1, <8 x i16> %r1)
  %r3 = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %r2, <8 x i16> %r1)
  ret <8 x i16> %r3
}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)

declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)

declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)