; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s

; Each test builds two horizontal-add/sub or pack results, %1 = op(%a0, %a1)
; and %2 = op(%a2, %a3), and then shuffles them together. The shuffle masks are
; chosen so that only one source operand of each op is actually observed, and
; the expected output is a single op on those two operands:
;   * "unpackl" masks take the low half of every 128-bit lane of %1 and %2,
;     so the expected code is op(%a0, %a2).
;   * "unpackh" masks take the high half of every 128-bit lane of %1 and %2,
;     so the expected code is op(%a1, %a3).
;   * "shufps" masks pick 32-bit elements from one half of %1 and one half of
;     %2; the expected code is one pack followed by one vpshufd.

;
; 128-bit Vectors
;

; Floating-point horizontal add/sub.

define <4 x float> @test_unpackl_fhadd_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; CHECK-LABEL: test_unpackl_fhadd_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a2, <4 x float> %a3)
  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %3
}

define <2 x double> @test_unpackh_fhadd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; CHECK-LABEL: test_unpackh_fhadd_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhaddpd %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a2, <2 x double> %a3)
  %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %3
}

define <2 x double> @test_unpackl_fhsub_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; CHECK-LABEL: test_unpackl_fhsub_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a2, <2 x double> %a3)
  %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %3
}

define <4 x float> @test_unpackh_fhsub_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; CHECK-LABEL: test_unpackh_fhsub_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhsubps %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a2, <4 x float> %a3)
  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x float> %3
}

; Integer horizontal add/sub.

define <8 x i16> @test_unpackl_hadd_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; CHECK-LABEL: test_unpackl_hadd_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %3
}

define <4 x i32> @test_unpackh_hadd_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: test_unpackh_hadd_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphaddd %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i32> %3
}

define <4 x i32> @test_unpackl_hsub_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: test_unpackl_hsub_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i32> %3
}

define <8 x i16> @test_unpackh_hsub_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; CHECK-LABEL: test_unpackh_hsub_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphsubw %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %3
}

; Saturating packs (signed: packss*, unsigned: packus*).

define <16 x i8> @test_unpackl_packss_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; CHECK-LABEL: test_unpackl_packss_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i8> %3
}

define <8 x i16> @test_unpackh_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: test_unpackh_packss_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackssdw %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %3
}

define <8 x i16> @test_unpackl_packus_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: test_unpackl_packus_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %3
}

define <16 x i8> @test_unpackh_packus_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; CHECK-LABEL: test_unpackh_packus_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackuswb %xmm3, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %3
}

; Pack results reinterpreted as <4 x float> and combined with a two-input
; 32-bit element shuffle.

; Elements 0,1 come from the %a0 half of %1; element 6 is the first 32-bit
; element of the %a3 half of %2.
define <4 x float> @test_shufps_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; CHECK-LABEL: test_shufps_packss_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = bitcast <8 x i16> %1 to <4 x float>
  %4 = bitcast <8 x i16> %2 to <4 x float>
  %5 = shufflevector <4 x float> %3, <4 x float> %4, <4 x i32> <i32 0, i32 1, i32 6, i32 6>
  ret <4 x float> %5
}

; Elements 1,0 come from the %a0 half of %1; element 4 is the first 32-bit
; element of the %a2 half of %2.
define <4 x float> @test_shufps_packus_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; CHECK-LABEL: test_shufps_packus_128:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = bitcast <16 x i8> %1 to <4 x float>
  %4 = bitcast <16 x i8> %2 to <4 x float>
  %5 = shufflevector <4 x float> %3, <4 x float> %4, <4 x i32> <i32 1, i32 0, i32 4, i32 4>
  ret <4 x float> %5
}

;
; 256-bit Vectors
;
; Same patterns as above, with the shuffle masks repeated per 128-bit lane.

; Floating-point horizontal add/sub.

define <8 x float> @test_unpackl_fhadd_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; CHECK-LABEL: test_unpackl_fhadd_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a2, <8 x float> %a3)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x float> %3
}

define <4 x double> @test_unpackh_fhadd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; CHECK-LABEL: test_unpackh_fhadd_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhaddpd %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a2, <4 x double> %a3)
  %3 = shufflevector <4 x double> %1, <4 x double> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %3
}

define <4 x double> @test_unpackl_fhsub_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; CHECK-LABEL: test_unpackl_fhsub_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhsubpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a2, <4 x double> %a3)
  %3 = shufflevector <4 x double> %1, <4 x double> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %3
}

define <8 x float> @test_unpackh_fhsub_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; CHECK-LABEL: test_unpackh_fhsub_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vhsubps %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a2, <8 x float> %a3)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
  ret <8 x float> %3
}

; Integer horizontal add/sub.

define <16 x i16> @test_unpackl_hadd_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_unpackl_hadd_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphaddw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27>
  ret <16 x i16> %3
}

define <8 x i32> @test_unpackh_hadd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_unpackh_hadd_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphaddd %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
  ret <8 x i32> %3
}

define <8 x i32> @test_unpackl_hsub_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_unpackl_hsub_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphsubd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x i32> %3
}

define <16 x i16> @test_unpackh_hsub_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_unpackh_hsub_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vphsubw %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %3
}

; Saturating packs (signed: packss*, unsigned: packus*).

define <32 x i8> @test_unpackl_packss_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_unpackl_packss_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpacksswb %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
  ret <32 x i8> %3
}

define <16 x i16> @test_unpackh_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_unpackh_packss_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackssdw %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %3
}

define <16 x i16> @test_unpackl_packus_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_unpackl_packus_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27>
  ret <16 x i16> %3
}

; Uses the unsigned-saturating byte pack (packuswb), mirroring the 128-bit
; test_unpackh_packus_128, so that the 256-bit vpackuswb unpack-high case is
; covered.
define <32 x i8> @test_unpackh_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_unpackh_packus_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackuswb %ymm3, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <32 x i8> %3
}

; Pack results reinterpreted as <8 x float> and combined with a two-input
; 32-bit element shuffle that repeats the same pattern in both 128-bit lanes.

define <8 x float> @test_shufps_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; CHECK-LABEL: test_shufps_packss_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackssdw %ymm3, %ymm0, %ymm0
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
  %3 = bitcast <16 x i16> %1 to <8 x float>
  %4 = bitcast <16 x i16> %2 to <8 x float>
  %5 = shufflevector <8 x float> %3, <8 x float> %4, <8 x i32> <i32 0, i32 1, i32 10, i32 10, i32 4, i32 5, i32 14, i32 14>
  ret <8 x float> %5
}

define <8 x float> @test_shufps_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; CHECK-LABEL: test_shufps_packus_256:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a2, <16 x i16> %a3)
  %3 = bitcast <32 x i8> %1 to <8 x float>
  %4 = bitcast <32 x i8> %2 to <8 x float>
  %5 = shufflevector <8 x float> %3, <8 x float> %4, <8 x i32> <i32 1, i32 0, i32 8, i32 8, i32 5, i32 4, i32 12, i32 12>
  ret <8 x float> %5
}

; Intrinsic declarations: 128-bit floating-point horizontal add/sub.
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)

; 128-bit integer horizontal add/sub.
declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)

; 128-bit saturating packs.
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)

; 256-bit floating-point horizontal add/sub.
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>)
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>)

; 256-bit integer horizontal add/sub.
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>)
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>)

; 256-bit saturating packs.
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)