; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2

define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: phaddw %xmm1, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_reverse_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT: retq
  %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 7, i32 5, i32 3, i32 1, i32 15, i32 13, i32 11, i32 9>
  %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 6, i32 4, i32 2, i32 0, i32 14, i32 12, i32 10, i32 8>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT: pshufb %xmm2, %xmm0
; SSE-NEXT: pshufb %xmm2, %xmm1
; SSE-NEXT: phaddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_reverse2_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %shuf0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %rhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

define <8 x float> @hadd_reverse_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,2]
; SSE-NEXT: haddps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
  %lhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 7, i32 5, i32 15, i32 13, i32 3, i32 1, i32 11, i32 9>
  %rhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 6, i32 4, i32 14, i32 12, i32 2, i32 0, i32 10, i32 8>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

define <8 x float> @hadd_reverse2_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse2_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,2]
; SSE-NEXT: haddps %xmm2, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse2_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse2_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %rhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse3_v8f32:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT: haddps %xmm0, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse3_v8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse3_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %shuf0, %shuf1
  %shuf2 = shufflevector <8 x float> %add, <8 x float> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x float> %shuf2
}

define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: phaddw %xmm3, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4]
; SSE-NEXT: phaddw %xmm2, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: movdqa %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
  %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

define <16 x i16> @hadd_reverse2_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT: pshufb %xmm0, %xmm4
; SSE-NEXT: pshufb %xmm0, %xmm1
; SSE-NEXT: pshufb %xmm0, %xmm2
; SSE-NEXT: phaddw %xmm2, %xmm4
; SSE-NEXT: pshufb %xmm0, %xmm3
; SSE-NEXT: phaddw %xmm3, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse2_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vphaddw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse2_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  %rhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm8
; SSE-NEXT: movapd %xmm0, %xmm9
; SSE-NEXT: haddpd %xmm7, %xmm3
; SSE-NEXT: haddpd %xmm6, %xmm2
; SSE-NEXT: haddpd %xmm5, %xmm8
; SSE-NEXT: haddpd %xmm4, %xmm9
; SSE-NEXT: movapd %xmm3, %xmm0
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm8, %xmm2
; SSE-NEXT: movapd %xmm9, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse_v8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vmovapd %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v8f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovapd %ymm3, %ymm0
; AVX2-NEXT: retq
  %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 7, i32 15, i32 5, i32 13, i32 3, i32 11, i32 1, i32 9>
  %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 6, i32 14, i32 4, i32 12, i32 2, i32 10, i32 0, i32 8>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

define <8 x double> @hadd_reverse2_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8f64:
; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm1, %xmm8
; SSE-NEXT: movapd %xmm0, %xmm9
; SSE-NEXT: haddpd %xmm7, %xmm3
; SSE-NEXT: haddpd %xmm6, %xmm2
; SSE-NEXT: haddpd %xmm5, %xmm8
; SSE-NEXT: haddpd %xmm4, %xmm9
; SSE-NEXT: movapd %xmm3, %xmm0
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: movapd %xmm8, %xmm2
; SSE-NEXT: movapd %xmm9, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse2_v8f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vmovapd %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse2_v8f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovapd %ymm3, %ymm0
; AVX2-NEXT: retq
  %shuf0 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %rhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm8
; SSE-NEXT: haddps %xmm3, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT: haddps %xmm7, %xmm6
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
; SSE-NEXT: haddps %xmm1, %xmm8
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,2,1,0]
; SSE-NEXT: haddps %xmm5, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: movaps %xmm6, %xmm1
; SSE-NEXT: movaps %xmm8, %xmm2
; SSE-NEXT: movaps %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse_v16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vhaddps %ymm0, %ymm4, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse_v16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1]
; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1]
; AVX2-NEXT: vmovaps %ymm3, %ymm0
; AVX2-NEXT: retq
  %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}

define <16 x float> @hadd_reverse2_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm8
; SSE-NEXT: movaps %xmm0, %xmm9
; SSE-NEXT: haddps %xmm7, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT: haddps %xmm6, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
; SSE-NEXT: haddps %xmm5, %xmm8
; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0,3,2]
; SSE-NEXT: haddps %xmm4, %xmm9
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0,3,2]
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm8, %xmm2
; SSE-NEXT: movaps %xmm9, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_reverse2_v16f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vhaddps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0,3,2,5,4,7,6]
; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT: vmovaps %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: hadd_reverse2_v16f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovaps %ymm3, %ymm0
; AVX2-NEXT: retq
  %shuf0 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
  %rhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 1, i32 3, i32 17, i32 19, i32 5, i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15, i32 29, i32 31>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}