; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64

;
; 128-bit Vectors
;

define void @test_demanded_haddps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_haddps_128:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_haddps_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovss %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %1)
  %3 = extractelement <4 x float> %2, i32 0
  store float %3, ptr%a2
  ret void
}

define void @test_demanded_hsubps_128(<4 x float> %a0, <4 x float> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_hsubps_128:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vextractps $2, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_hsubps_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vextractps $2, %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %a1)
  %3 = extractelement <4 x float> %2, i32 2
  store float %3, ptr%a2
  ret void
}

define void @test_demanded_haddpd_128(<2 x double> %a0, <2 x double> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_haddpd_128:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_haddpd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovlpd %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %1)
  %3 = extractelement <2 x double> %2, i32 0
  store double %3, ptr%a2
  ret void
}

define void @test_demanded_hsubpd_128(<2 x double> %a0, <2 x double> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_hsubpd_128:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_hsubpd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovlpd %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %1)
  %3 = extractelement <2 x double> %2, i32 0
  store double %3, ptr%a2
  ret void
}

define void @test_demanded_phaddd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_phaddd_128:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_phaddd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %1)
  %3 = extractelement <4 x i32> %2, i32 0
  store i32 %3, ptr%a2
  ret void
}

define void @test_demanded_phsubd_128(<4 x i32> %a0, <4 x i32> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_phsubd_128:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vpextrd $1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_phsubd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vpextrd $1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> zeroinitializer
  %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %1)
  %3 = extractelement <4 x i32> %2, i32 1
  store i32 %3, ptr%a2
  ret void
}

define void @test_demanded_phaddw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_phaddw_128:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_phaddw_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; X64-NEXT:    vpextrw $0, %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %1)
  %3 = extractelement <8 x i16> %2, i16 0
  store i16 %3, ptr%a2
  ret void
}

define void @test_demanded_phsubw_128(<8 x i16> %a0, <8 x i16> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_phsubw_128:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; X86-NEXT:    vpextrw $2, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_phsubw_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; X64-NEXT:    vpextrw $2, %xmm0, (%rdi)
; X64-NEXT:    retq
  %1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> zeroinitializer
  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %1)
  %3 = extractelement <8 x i16> %2, i16 2
  store i16 %3, ptr%a2
  ret void
}

;
; 256-bit Vectors
;

define void @test_demanded_haddps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_haddps_256:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_haddps_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovss %xmm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %1 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %1)
  %3 = extractelement <8 x float> %2, i32 4
  store float %3, ptr%a2
  ret void
}

define void @test_demanded_hsubps_256(<8 x float> %a0, <8 x float> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_hsubps_256:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vextractps $3, %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_hsubps_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vextractps $3, %xmm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %a1)
  %3 = extractelement <8 x float> %2, i32 7
  store float %3, ptr%a2
  ret void
}

define void @test_demanded_haddpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_haddpd_256:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_haddpd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovlpd %xmm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> zeroinitializer
  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %1)
  %3 = extractelement <4 x double> %2, i32 2
  store double %3, ptr%a2
  ret void
}

define void @test_demanded_hsubpd_256(<4 x double> %a0, <4 x double> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_hsubpd_256:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_hsubpd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovlpd %xmm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %1 = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> zeroinitializer
  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %1)
  %3 = extractelement <4 x double> %2, i32 2
  store double %3, ptr%a2
  ret void
}

define void @test_demanded_phaddd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_phaddd_256:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm0
; X86-NEXT:    vpextrd $3, %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_phaddd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm0
; X64-NEXT:    vpextrd $3, %xmm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> zeroinitializer
  %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %1, <8 x i32> %a1)
  %3 = extractelement <8 x i32> %2, i32 7
  store i32 %3, ptr%a2
  ret void
}

define void @test_demanded_phsubd_256(<8 x i32> %a0, <8 x i32> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_phsubd_256:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm0
; X86-NEXT:    vpextrd $1, %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_phsubd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm0
; X64-NEXT:    vpextrd $1, %xmm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %1 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> zeroinitializer
  %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %1)
  %3 = extractelement <8 x i32> %2, i32 5
  store i32 %3, ptr%a2
  ret void
}

define void @test_demanded_phaddw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_phaddw_256:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpbroadcastw %xmm1, %xmm0
; X86-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; X86-NEXT:    vpextrw $4, %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_phaddw_256:
; X64:       ## %bb.0:
; X64-NEXT:    vpbroadcastw %xmm1, %xmm0
; X64-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; X64-NEXT:    vpextrw $4, %xmm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> zeroinitializer
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %1)
  %3 = extractelement <16 x i16> %2, i32 4
  store i16 %3, ptr%a2
  ret void
}

define void @test_demanded_phsubw_256(<16 x i16> %a0, <16 x i16> %a1, ptr%a2) nounwind {
; X86-LABEL: test_demanded_phsubw_256:
; X86:       ## %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpextrw $6, %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_demanded_phsubw_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpextrw $6, %xmm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> zeroinitializer
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %1, <16 x i16> %a1)
  %3 = extractelement <16 x i16> %2, i32 6
  store i16 %3, ptr%a2
  ret void
}

declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)

declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)

declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>)
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>)
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>)

declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>)
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>)
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>)