; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512

;
; 128-bit vectors
;

define <16 x i8> @test_fixed_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_fixed_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_fixed_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_fixed_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_fixed_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %and = and <16 x i8> %a0, %a1
  %xor = xor <16 x i8> %a0, %a1
  %shift = ashr <16 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %res = add <16 x i8> %and, %shift
  ret <16 x i8> %res
}

define <16 x i8> @test_ext_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_ext_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_ext_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64]
; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
; AVX512-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %x0 = sext <16 x i8> %a0 to <16 x i16>
  %x1 = sext <16 x i8> %a1 to <16 x i16>
  %sum = add <16 x i16> %x0, %x1
  %shift = ashr <16 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <16 x i16> %shift to <16 x i8>
  ret <16 x i8> %res
}

define <8 x i16> @test_fixed_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_fixed_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: psraw $1, %xmm0
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_fixed_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
  %and = and <8 x i16> %a0, %a1
  %xor = xor <8 x i16> %a1, %a0
  %shift = ashr <8 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = add <8 x i16> %and, %shift
  ret <8 x i16> %res
}

define <8 x i16> @test_ext_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_ext_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: psraw $1, %xmm0
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_ext_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
  %x0 = sext <8 x i16> %a0 to <8 x i32>
  %x1 = sext <8 x i16> %a1 to <8 x i32>
  %sum = add <8 x i32> %x0, %x1
  %shift = ashr <8 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = trunc <8 x i32> %shift to <8 x i16>
  ret <8 x i16> %res
}

define <4 x i32> @test_fixed_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: test_fixed_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pand %xmm1, %xmm2
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: psrad $1, %xmm0
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_fixed_v4i32:
; AVX: #
%bb.0: 184; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 185; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 186; AVX-NEXT: vpsrad $1, %xmm0, %xmm0 187; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0 188; AVX-NEXT: retq 189 %and = and <4 x i32> %a0, %a1 190 %xor = xor <4 x i32> %a1, %a0 191 %shift = ashr <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1> 192 %res = add <4 x i32> %and, %shift 193 ret <4 x i32> %res 194} 195 196define <4 x i32> @test_ext_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { 197; SSE-LABEL: test_ext_v4i32: 198; SSE: # %bb.0: 199; SSE-NEXT: movdqa %xmm0, %xmm2 200; SSE-NEXT: pand %xmm1, %xmm2 201; SSE-NEXT: pxor %xmm1, %xmm0 202; SSE-NEXT: psrad $1, %xmm0 203; SSE-NEXT: paddd %xmm2, %xmm0 204; SSE-NEXT: retq 205; 206; AVX-LABEL: test_ext_v4i32: 207; AVX: # %bb.0: 208; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 209; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 210; AVX-NEXT: vpsrad $1, %xmm0, %xmm0 211; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0 212; AVX-NEXT: retq 213 %x0 = sext <4 x i32> %a0 to <4 x i64> 214 %x1 = sext <4 x i32> %a1 to <4 x i64> 215 %sum = add <4 x i64> %x0, %x1 216 %shift = ashr <4 x i64> %sum, <i64 1, i64 1, i64 1, i64 1> 217 %res = trunc <4 x i64> %shift to <4 x i32> 218 ret <4 x i32> %res 219} 220 221define <2 x i64> @test_fixed_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind { 222; SSE2-LABEL: test_fixed_v2i64: 223; SSE2: # %bb.0: 224; SSE2-NEXT: movdqa %xmm0, %xmm2 225; SSE2-NEXT: pxor %xmm1, %xmm2 226; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] 227; SSE2-NEXT: psrad $1, %xmm3 228; SSE2-NEXT: psrlq $1, %xmm2 229; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 230; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 231; SSE2-NEXT: pand %xmm1, %xmm0 232; SSE2-NEXT: paddq %xmm2, %xmm0 233; SSE2-NEXT: retq 234; 235; SSE4-LABEL: test_fixed_v2i64: 236; SSE4: # %bb.0: 237; SSE4-NEXT: movdqa %xmm0, %xmm2 238; SSE4-NEXT: pxor %xmm1, %xmm2 239; SSE4-NEXT: movdqa %xmm2, %xmm3 240; SSE4-NEXT: psrad $1, %xmm3 241; SSE4-NEXT: psrlq $1, %xmm2 242; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 243; SSE4-NEXT: pand %xmm1, %xmm0 244; SSE4-NEXT: paddq %xmm2, %xmm0 245; SSE4-NEXT: retq 246; 247; AVX1-LABEL: test_fixed_v2i64: 248; AVX1: # %bb.0: 249; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 250; AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 251; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm2 252; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 253; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 254; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 255; AVX1-NEXT: retq 256; 257; AVX2-LABEL: test_fixed_v2i64: 258; AVX2: # %bb.0: 259; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 260; AVX2-NEXT: vpsrad $1, %xmm2, %xmm3 261; AVX2-NEXT: vpsrlq $1, %xmm2, %xmm2 262; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] 263; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 264; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 265; AVX2-NEXT: retq 266; 267; AVX512-LABEL: test_fixed_v2i64: 268; AVX512: # %bb.0: 269; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 270; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 271; AVX512-NEXT: vpsraq $1, %xmm0, %xmm0 272; AVX512-NEXT: vpaddq %xmm0, %xmm2, %xmm0 273; AVX512-NEXT: retq 274 %and = and <2 x i64> %a0, %a1 275 %xor = xor <2 x i64> %a1, %a0 276 %shift = ashr <2 x i64> %xor, <i64 1, i64 1> 277 %res = add <2 x i64> %and, %shift 278 ret <2 x i64> %res 279} 280 281define <2 x i64> @test_ext_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind { 282; SSE2-LABEL: test_ext_v2i64: 283; SSE2: # %bb.0: 284; SSE2-NEXT: movdqa %xmm0, %xmm2 285; SSE2-NEXT: pxor %xmm1, %xmm2 286; SSE2-NEXT: pshufd {{.*#+}} 
xmm3 = xmm2[1,3,2,3] 287; SSE2-NEXT: psrad $1, %xmm3 288; SSE2-NEXT: psrlq $1, %xmm2 289; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 290; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 291; SSE2-NEXT: pand %xmm1, %xmm0 292; SSE2-NEXT: paddq %xmm2, %xmm0 293; SSE2-NEXT: retq 294; 295; SSE4-LABEL: test_ext_v2i64: 296; SSE4: # %bb.0: 297; SSE4-NEXT: movdqa %xmm0, %xmm2 298; SSE4-NEXT: pxor %xmm1, %xmm2 299; SSE4-NEXT: movdqa %xmm2, %xmm3 300; SSE4-NEXT: psrad $1, %xmm3 301; SSE4-NEXT: psrlq $1, %xmm2 302; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 303; SSE4-NEXT: pand %xmm1, %xmm0 304; SSE4-NEXT: paddq %xmm2, %xmm0 305; SSE4-NEXT: retq 306; 307; AVX1-LABEL: test_ext_v2i64: 308; AVX1: # %bb.0: 309; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 310; AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 311; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm2 312; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 313; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 314; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 315; AVX1-NEXT: retq 316; 317; AVX2-LABEL: test_ext_v2i64: 318; AVX2: # %bb.0: 319; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 320; AVX2-NEXT: vpsrad $1, %xmm2, %xmm3 321; AVX2-NEXT: vpsrlq $1, %xmm2, %xmm2 322; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] 323; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 324; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 325; AVX2-NEXT: retq 326; 327; AVX512-LABEL: test_ext_v2i64: 328; AVX512: # %bb.0: 329; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 330; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 331; AVX512-NEXT: vpsraq $1, %xmm0, %xmm0 332; AVX512-NEXT: vpaddq %xmm0, %xmm2, %xmm0 333; AVX512-NEXT: retq 334 %x0 = sext <2 x i64> %a0 to <2 x i128> 335 %x1 = sext <2 x i64> %a1 to <2 x i128> 336 %sum = add <2 x i128> %x0, %x1 337 %shift = ashr <2 x i128> %sum, <i128 1, i128 1> 338 %res = trunc <2 x i128> %shift to <2 x i64> 339 ret <2 x i64> %res 340} 341 342; 343; 256-bit vectors 344; 345 346define <32 x i8> @test_fixed_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind { 347; SSE-LABEL: test_fixed_v32i8: 348; SSE: # %bb.0: 349; SSE-NEXT: movdqa %xmm0, %xmm4 350; SSE-NEXT: pand %xmm2, %xmm4 351; SSE-NEXT: pxor %xmm2, %xmm0 352; SSE-NEXT: psrlw $1, %xmm0 353; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 354; SSE-NEXT: pand %xmm2, %xmm0 355; SSE-NEXT: movdqa {{.*#+}} xmm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 356; SSE-NEXT: pxor %xmm5, %xmm0 357; SSE-NEXT: paddb %xmm4, %xmm0 358; SSE-NEXT: psubb %xmm5, %xmm0 359; SSE-NEXT: movdqa %xmm1, %xmm4 360; SSE-NEXT: pand %xmm3, %xmm4 361; SSE-NEXT: pxor %xmm3, %xmm1 362; SSE-NEXT: psrlw $1, %xmm1 363; SSE-NEXT: pand %xmm2, %xmm1 364; SSE-NEXT: pxor %xmm5, %xmm1 365; SSE-NEXT: paddb %xmm4, %xmm1 366; SSE-NEXT: psubb %xmm5, %xmm1 367; SSE-NEXT: retq 368; 369; AVX1-LABEL: test_fixed_v32i8: 370; AVX1: # %bb.0: 371; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm2 372; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 373; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 374; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 375; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 376; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 377; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 378; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 379; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 380; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 381; AVX1-NEXT: vpsubb %xmm5, %xmm1, %xmm1 382; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 383; 
AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 384; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 385; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 386; AVX1-NEXT: vpsubb %xmm5, %xmm0, %xmm0 387; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 388; AVX1-NEXT: retq 389; 390; AVX2-LABEL: test_fixed_v32i8: 391; AVX2: # %bb.0: 392; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 393; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 394; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 395; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 396; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 397; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 398; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 399; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 400; AVX2-NEXT: retq 401; 402; AVX512-LABEL: test_fixed_v32i8: 403; AVX512: # %bb.0: 404; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 405; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 406; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0 407; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 408; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 409; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0 410; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 411; AVX512-NEXT: retq 412 %and = and <32 x i8> %a0, %a1 413 %xor = xor <32 x i8> %a0, %a1 414 %shift = ashr <32 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 415 %res = add <32 x i8> %and, %shift 416 ret <32 x i8> %res 417} 418 419define <32 x i8> @test_ext_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind { 420; SSE-LABEL: test_ext_v32i8: 421; SSE: # %bb.0: 422; SSE-NEXT: movdqa %xmm0, %xmm4 423; SSE-NEXT: pand %xmm2, %xmm4 424; SSE-NEXT: pxor %xmm2, %xmm0 425; SSE-NEXT: psrlw $1, %xmm0 426; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 427; SSE-NEXT: pand %xmm2, %xmm0 428; SSE-NEXT: movdqa {{.*#+}} xmm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 429; SSE-NEXT: pxor %xmm5, %xmm0 430; SSE-NEXT: paddb %xmm4, %xmm0 431; SSE-NEXT: psubb %xmm5, %xmm0 432; SSE-NEXT: movdqa %xmm1, %xmm4 433; SSE-NEXT: pand %xmm3, %xmm4 434; SSE-NEXT: pxor %xmm3, %xmm1 435; SSE-NEXT: psrlw $1, %xmm1 436; SSE-NEXT: pand %xmm2, %xmm1 437; SSE-NEXT: pxor %xmm5, %xmm1 438; SSE-NEXT: paddb %xmm4, %xmm1 439; SSE-NEXT: psubb %xmm5, %xmm1 440; SSE-NEXT: retq 441; 442; AVX1-LABEL: test_ext_v32i8: 443; AVX1: # %bb.0: 444; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm2 445; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 446; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 447; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 448; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 449; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 450; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3 451; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 452; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 453; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 454; AVX1-NEXT: vpsubb %xmm5, %xmm1, %xmm1 455; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 456; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 457; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 458; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0 459; AVX1-NEXT: vpsubb %xmm5, %xmm0, %xmm0 460; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 461; AVX1-NEXT: retq 462; 463; AVX2-LABEL: test_ext_v32i8: 464; AVX2: # %bb.0: 465; 
AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 466; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 467; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 468; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 469; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 470; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 471; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 472; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 473; AVX2-NEXT: retq 474; 475; AVX512-LABEL: test_ext_v32i8: 476; AVX512: # %bb.0: 477; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 478; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 479; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0 480; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 481; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 482; AVX512-NEXT: vpaddb %ymm2, %ymm0, %ymm0 483; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 484; AVX512-NEXT: retq 485 %x0 = sext <32 x i8> %a0 to <32 x i16> 486 %x1 = sext <32 x i8> %a1 to <32 x i16> 487 %sum = add <32 x i16> %x0, %x1 488 %shift = ashr <32 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 489 %res = trunc <32 x i16> %shift to <32 x i8> 490 ret <32 x i8> %res 491} 492 493define <16 x i16> @test_fixed_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind { 494; SSE-LABEL: test_fixed_v16i16: 495; SSE: # %bb.0: 496; SSE-NEXT: movdqa %xmm0, %xmm4 497; SSE-NEXT: pand %xmm2, %xmm4 498; SSE-NEXT: pxor %xmm2, %xmm0 499; SSE-NEXT: psraw $1, %xmm0 500; SSE-NEXT: paddw %xmm4, %xmm0 501; SSE-NEXT: movdqa %xmm1, %xmm2 502; SSE-NEXT: pand %xmm3, %xmm2 503; SSE-NEXT: pxor %xmm3, %xmm1 504; SSE-NEXT: psraw $1, %xmm1 505; SSE-NEXT: paddw %xmm2, %xmm1 506; SSE-NEXT: retq 507; 508; AVX1-LABEL: test_fixed_v16i16: 509; AVX1: # %bb.0: 510; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm2 511; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 512; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 513; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 514; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1 515; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 516; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0 517; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 518; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 519; AVX1-NEXT: retq 520; 521; AVX2-LABEL: test_fixed_v16i16: 522; AVX2: # %bb.0: 523; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 524; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 525; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0 526; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0 527; AVX2-NEXT: retq 528; 529; AVX512-LABEL: test_fixed_v16i16: 530; AVX512: # %bb.0: 531; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 532; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 533; AVX512-NEXT: vpsraw $1, %ymm0, %ymm0 534; AVX512-NEXT: vpaddw %ymm0, %ymm2, %ymm0 535; AVX512-NEXT: retq 536 %and = and <16 x i16> %a0, %a1 537 %xor = xor <16 x i16> %a1, %a0 538 %shift = ashr <16 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 539 %res = add <16 x i16> %and, %shift 540 ret <16 x i16> %res 541} 542 543define <16 x i16> @test_ext_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind { 544; SSE-LABEL: test_ext_v16i16: 545; SSE: # %bb.0: 546; SSE-NEXT: movdqa %xmm0, %xmm4 547; SSE-NEXT: pand %xmm2, %xmm4 548; SSE-NEXT: pxor %xmm2, %xmm0 549; SSE-NEXT: psraw $1, %xmm0 550; SSE-NEXT: paddw %xmm4, %xmm0 551; 
SSE-NEXT: movdqa %xmm1, %xmm2 552; SSE-NEXT: pand %xmm3, %xmm2 553; SSE-NEXT: pxor %xmm3, %xmm1 554; SSE-NEXT: psraw $1, %xmm1 555; SSE-NEXT: paddw %xmm2, %xmm1 556; SSE-NEXT: retq 557; 558; AVX1-LABEL: test_ext_v16i16: 559; AVX1: # %bb.0: 560; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm2 561; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 562; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 563; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 564; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1 565; AVX1-NEXT: vpaddw %xmm1, %xmm3, %xmm1 566; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0 567; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0 568; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 569; AVX1-NEXT: retq 570; 571; AVX2-LABEL: test_ext_v16i16: 572; AVX2: # %bb.0: 573; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 574; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 575; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0 576; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0 577; AVX2-NEXT: retq 578; 579; AVX512-LABEL: test_ext_v16i16: 580; AVX512: # %bb.0: 581; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 582; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 583; AVX512-NEXT: vpsraw $1, %ymm0, %ymm0 584; AVX512-NEXT: vpaddw %ymm0, %ymm2, %ymm0 585; AVX512-NEXT: retq 586 %x0 = sext <16 x i16> %a0 to <16 x i32> 587 %x1 = sext <16 x i16> %a1 to <16 x i32> 588 %sum = add <16 x i32> %x0, %x1 589 %shift = ashr <16 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 590 %res = trunc <16 x i32> %shift to <16 x i16> 591 ret <16 x i16> %res 592} 593 594define <8 x i32> @test_fixed_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind { 595; SSE-LABEL: test_fixed_v8i32: 596; SSE: # %bb.0: 597; SSE-NEXT: movdqa %xmm0, %xmm4 598; SSE-NEXT: pand %xmm2, %xmm4 599; SSE-NEXT: pxor %xmm2, %xmm0 600; SSE-NEXT: psrad $1, %xmm0 601; SSE-NEXT: paddd %xmm4, %xmm0 602; SSE-NEXT: movdqa %xmm1, %xmm2 603; SSE-NEXT: pand %xmm3, %xmm2 604; SSE-NEXT: pxor %xmm3, %xmm1 605; SSE-NEXT: psrad $1, %xmm1 606; SSE-NEXT: paddd %xmm2, %xmm1 607; SSE-NEXT: retq 608; 609; AVX1-LABEL: test_fixed_v8i32: 610; AVX1: # %bb.0: 611; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm2 612; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 613; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 614; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 615; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1 616; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 617; AVX1-NEXT: vpsrad $1, %xmm0, %xmm0 618; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 619; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 620; AVX1-NEXT: retq 621; 622; AVX2-LABEL: test_fixed_v8i32: 623; AVX2: # %bb.0: 624; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 625; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 626; AVX2-NEXT: vpsrad $1, %ymm0, %ymm0 627; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 628; AVX2-NEXT: retq 629; 630; AVX512-LABEL: test_fixed_v8i32: 631; AVX512: # %bb.0: 632; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 633; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 634; AVX512-NEXT: vpsrad $1, %ymm0, %ymm0 635; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 636; AVX512-NEXT: retq 637 %and = and <8 x i32> %a0, %a1 638 %xor = xor <8 x i32> %a1, %a0 639 %shift = ashr <8 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 640 %res = add <8 x i32> %and, %shift 641 ret <8 x i32> %res 642} 643 644define <8 x i32> @test_ext_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind { 645; SSE-LABEL: test_ext_v8i32: 646; SSE: # %bb.0: 647; SSE-NEXT: movdqa %xmm0, %xmm4 648; SSE-NEXT: pand %xmm2, %xmm4 649; SSE-NEXT: pxor %xmm2, %xmm0 650; SSE-NEXT: psrad $1, %xmm0 651; SSE-NEXT: paddd %xmm4, %xmm0 652; SSE-NEXT: movdqa %xmm1, %xmm2 653; 
SSE-NEXT: pand %xmm3, %xmm2 654; SSE-NEXT: pxor %xmm3, %xmm1 655; SSE-NEXT: psrad $1, %xmm1 656; SSE-NEXT: paddd %xmm2, %xmm1 657; SSE-NEXT: retq 658; 659; AVX1-LABEL: test_ext_v8i32: 660; AVX1: # %bb.0: 661; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm2 662; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 663; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 664; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 665; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1 666; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 667; AVX1-NEXT: vpsrad $1, %xmm0, %xmm0 668; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 669; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 670; AVX1-NEXT: retq 671; 672; AVX2-LABEL: test_ext_v8i32: 673; AVX2: # %bb.0: 674; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 675; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 676; AVX2-NEXT: vpsrad $1, %ymm0, %ymm0 677; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 678; AVX2-NEXT: retq 679; 680; AVX512-LABEL: test_ext_v8i32: 681; AVX512: # %bb.0: 682; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 683; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 684; AVX512-NEXT: vpsrad $1, %ymm0, %ymm0 685; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0 686; AVX512-NEXT: retq 687 %x0 = sext <8 x i32> %a0 to <8 x i64> 688 %x1 = sext <8 x i32> %a1 to <8 x i64> 689 %sum = add <8 x i64> %x0, %x1 690 %shift = ashr <8 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 691 %res = trunc <8 x i64> %shift to <8 x i32> 692 ret <8 x i32> %res 693} 694 695define <4 x i64> @test_fixed_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 696; SSE2-LABEL: test_fixed_v4i64: 697; SSE2: # %bb.0: 698; SSE2-NEXT: movdqa %xmm0, %xmm4 699; SSE2-NEXT: pxor %xmm2, %xmm4 700; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] 701; SSE2-NEXT: psrad $1, %xmm5 702; SSE2-NEXT: psrlq $1, %xmm4 703; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 704; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 705; SSE2-NEXT: pand %xmm2, %xmm0 706; SSE2-NEXT: paddq %xmm4, %xmm0 707; SSE2-NEXT: movdqa %xmm1, %xmm2 708; SSE2-NEXT: pxor %xmm3, %xmm2 709; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] 710; SSE2-NEXT: psrad $1, %xmm4 711; SSE2-NEXT: psrlq $1, %xmm2 712; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 713; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 714; SSE2-NEXT: pand %xmm3, %xmm1 715; SSE2-NEXT: paddq %xmm2, %xmm1 716; SSE2-NEXT: retq 717; 718; SSE4-LABEL: test_fixed_v4i64: 719; SSE4: # %bb.0: 720; SSE4-NEXT: movdqa %xmm0, %xmm4 721; SSE4-NEXT: pxor %xmm2, %xmm4 722; SSE4-NEXT: movdqa %xmm4, %xmm5 723; SSE4-NEXT: psrad $1, %xmm5 724; SSE4-NEXT: psrlq $1, %xmm4 725; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 726; SSE4-NEXT: pand %xmm2, %xmm0 727; SSE4-NEXT: paddq %xmm4, %xmm0 728; SSE4-NEXT: movdqa %xmm1, %xmm2 729; SSE4-NEXT: pxor %xmm3, %xmm2 730; SSE4-NEXT: movdqa %xmm2, %xmm4 731; SSE4-NEXT: psrad $1, %xmm4 732; SSE4-NEXT: psrlq $1, %xmm2 733; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 734; SSE4-NEXT: pand %xmm3, %xmm1 735; SSE4-NEXT: paddq %xmm2, %xmm1 736; SSE4-NEXT: retq 737; 738; AVX1-LABEL: test_fixed_v4i64: 739; AVX1: # %bb.0: 740; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm2 741; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 742; AVX1-NEXT: vpsrad $1, %xmm3, %xmm4 743; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm3 744; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 745; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 746; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 747; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 748; AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 749; AVX1-NEXT: 
vpsrlq $1, %xmm2, %xmm2 750; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 751; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 752; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 753; AVX1-NEXT: retq 754; 755; AVX2-LABEL: test_fixed_v4i64: 756; AVX2: # %bb.0: 757; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm2 758; AVX2-NEXT: vpsrad $1, %ymm2, %ymm3 759; AVX2-NEXT: vpsrlq $1, %ymm2, %ymm2 760; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] 761; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 762; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 763; AVX2-NEXT: retq 764; 765; AVX512-LABEL: test_fixed_v4i64: 766; AVX512: # %bb.0: 767; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 768; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 769; AVX512-NEXT: vpsraq $1, %ymm0, %ymm0 770; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0 771; AVX512-NEXT: retq 772 %and = and <4 x i64> %a0, %a1 773 %xor = xor <4 x i64> %a1, %a0 774 %shift = ashr <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1> 775 %res = add <4 x i64> %and, %shift 776 ret <4 x i64> %res 777} 778 779define <4 x i64> @test_ext_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind { 780; SSE2-LABEL: test_ext_v4i64: 781; SSE2: # %bb.0: 782; SSE2-NEXT: movdqa %xmm0, %xmm4 783; SSE2-NEXT: pxor %xmm2, %xmm4 784; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] 785; SSE2-NEXT: psrad $1, %xmm5 786; SSE2-NEXT: psrlq $1, %xmm4 787; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 788; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 789; SSE2-NEXT: pand %xmm2, %xmm0 790; SSE2-NEXT: paddq %xmm4, %xmm0 791; SSE2-NEXT: movdqa %xmm1, %xmm2 792; SSE2-NEXT: pxor %xmm3, %xmm2 793; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] 794; SSE2-NEXT: psrad $1, %xmm4 795; SSE2-NEXT: psrlq $1, %xmm2 796; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 797; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 798; SSE2-NEXT: pand %xmm3, %xmm1 799; SSE2-NEXT: paddq %xmm2, %xmm1 800; SSE2-NEXT: retq 801; 802; SSE4-LABEL: test_ext_v4i64: 803; SSE4: # %bb.0: 804; SSE4-NEXT: movdqa %xmm0, %xmm4 805; SSE4-NEXT: pxor %xmm2, %xmm4 806; SSE4-NEXT: movdqa %xmm4, %xmm5 807; SSE4-NEXT: psrad $1, %xmm5 808; SSE4-NEXT: psrlq $1, %xmm4 809; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 810; SSE4-NEXT: pand %xmm2, %xmm0 811; SSE4-NEXT: paddq %xmm4, %xmm0 812; SSE4-NEXT: movdqa %xmm1, %xmm2 813; SSE4-NEXT: pxor %xmm3, %xmm2 814; SSE4-NEXT: movdqa %xmm2, %xmm4 815; SSE4-NEXT: psrad $1, %xmm4 816; SSE4-NEXT: psrlq $1, %xmm2 817; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 818; SSE4-NEXT: pand %xmm3, %xmm1 819; SSE4-NEXT: paddq %xmm2, %xmm1 820; SSE4-NEXT: retq 821; 822; AVX1-LABEL: test_ext_v4i64: 823; AVX1: # %bb.0: 824; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm2 825; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 826; AVX1-NEXT: vpsrad $1, %xmm3, %xmm4 827; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm3 828; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 829; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 830; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 831; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 832; AVX1-NEXT: vpsrad $1, %xmm2, %xmm3 833; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm2 834; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 835; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 836; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 837; AVX1-NEXT: retq 838; 839; AVX2-LABEL: test_ext_v4i64: 840; AVX2: # %bb.0: 841; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm2 842; AVX2-NEXT: vpsrad $1, 
%ymm2, %ymm3 843; AVX2-NEXT: vpsrlq $1, %ymm2, %ymm2 844; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] 845; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 846; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 847; AVX2-NEXT: retq 848; 849; AVX512-LABEL: test_ext_v4i64: 850; AVX512: # %bb.0: 851; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 852; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 853; AVX512-NEXT: vpsraq $1, %ymm0, %ymm0 854; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0 855; AVX512-NEXT: retq 856 %x0 = sext <4 x i64> %a0 to <4 x i128> 857 %x1 = sext <4 x i64> %a1 to <4 x i128> 858 %sum = add <4 x i128> %x0, %x1 859 %shift = ashr <4 x i128> %sum, <i128 1, i128 1, i128 1, i128 1> 860 %res = trunc <4 x i128> %shift to <4 x i64> 861 ret <4 x i64> %res 862} 863 864; 865; 512-bit vectors 866; 867 868define <64 x i8> @test_fixed_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { 869; SSE-LABEL: test_fixed_v64i8: 870; SSE: # %bb.0: 871; SSE-NEXT: movdqa %xmm0, %xmm9 872; SSE-NEXT: pand %xmm4, %xmm9 873; SSE-NEXT: pxor %xmm4, %xmm0 874; SSE-NEXT: psrlw $1, %xmm0 875; SSE-NEXT: movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 876; SSE-NEXT: pand %xmm8, %xmm0 877; SSE-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 878; SSE-NEXT: pxor %xmm4, %xmm0 879; SSE-NEXT: paddb %xmm9, %xmm0 880; SSE-NEXT: psubb %xmm4, %xmm0 881; SSE-NEXT: movdqa %xmm1, %xmm9 882; SSE-NEXT: pand %xmm5, %xmm9 883; SSE-NEXT: pxor %xmm5, %xmm1 884; SSE-NEXT: psrlw $1, %xmm1 885; SSE-NEXT: pand %xmm8, %xmm1 886; SSE-NEXT: pxor %xmm4, %xmm1 887; SSE-NEXT: paddb %xmm9, %xmm1 888; SSE-NEXT: psubb %xmm4, %xmm1 889; SSE-NEXT: movdqa %xmm2, %xmm5 890; SSE-NEXT: pand %xmm6, %xmm5 891; SSE-NEXT: pxor %xmm6, %xmm2 892; SSE-NEXT: psrlw $1, %xmm2 893; SSE-NEXT: pand %xmm8, %xmm2 894; SSE-NEXT: pxor %xmm4, %xmm2 895; SSE-NEXT: paddb %xmm5, %xmm2 896; SSE-NEXT: psubb %xmm4, %xmm2 897; SSE-NEXT: movdqa %xmm3, %xmm5 898; SSE-NEXT: pand %xmm7, %xmm5 899; SSE-NEXT: pxor %xmm7, %xmm3 900; SSE-NEXT: psrlw $1, %xmm3 901; SSE-NEXT: pand %xmm8, %xmm3 902; SSE-NEXT: pxor %xmm4, %xmm3 903; SSE-NEXT: paddb %xmm5, %xmm3 904; SSE-NEXT: psubb %xmm4, %xmm3 905; SSE-NEXT: retq 906; 907; AVX1-LABEL: test_fixed_v64i8: 908; AVX1: # %bb.0: 909; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm4 910; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 911; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 912; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 913; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 914; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 915; AVX1-NEXT: vpxor %xmm7, %xmm5, %xmm5 916; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 917; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 918; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 919; AVX1-NEXT: vpsubb %xmm7, %xmm2, %xmm2 920; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4 921; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 922; AVX1-NEXT: vpxor %xmm7, %xmm4, %xmm4 923; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0 924; AVX1-NEXT: vpsubb %xmm7, %xmm0, %xmm0 925; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 926; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm2 927; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 928; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4 929; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 930; AVX1-NEXT: vpxor %xmm7, %xmm4, %xmm4 931; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 932; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 933; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 934; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3 935; 
AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 936; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 937; AVX1-NEXT: vpxor %xmm7, %xmm2, %xmm2 938; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 939; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1 940; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 941; AVX1-NEXT: retq 942; 943; AVX2-LABEL: test_fixed_v64i8: 944; AVX2: # %bb.0: 945; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm4 946; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 947; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 948; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 949; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 950; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 951; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm0 952; AVX2-NEXT: vpaddb %ymm4, %ymm0, %ymm0 953; AVX2-NEXT: vpsubb %ymm5, %ymm0, %ymm0 954; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm4 955; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 956; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 957; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 958; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1 959; AVX2-NEXT: vpaddb %ymm4, %ymm1, %ymm1 960; AVX2-NEXT: vpsubb %ymm5, %ymm1, %ymm1 961; AVX2-NEXT: retq 962; 963; AVX512-LABEL: test_fixed_v64i8: 964; AVX512: # %bb.0: 965; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2 966; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 967; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0 968; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 969; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 970; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0 971; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm0 972; AVX512-NEXT: retq 973 %and = and <64 x i8> %a0, %a1 974 %xor = xor <64 x i8> %a0, %a1 975 %shift = ashr <64 x i8> %xor, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 976 %res = add <64 x i8> %and, %shift 977 ret <64 x i8> %res 978} 979 980define <64 x i8> @test_ext_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { 981; SSE-LABEL: test_ext_v64i8: 982; SSE: # %bb.0: 983; SSE-NEXT: movdqa %xmm0, %xmm9 984; SSE-NEXT: pand %xmm4, %xmm9 985; SSE-NEXT: pxor %xmm4, %xmm0 986; SSE-NEXT: psrlw $1, %xmm0 987; SSE-NEXT: movdqa {{.*#+}} xmm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 988; SSE-NEXT: pand %xmm8, %xmm0 989; SSE-NEXT: movdqa {{.*#+}} xmm4 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 990; SSE-NEXT: pxor %xmm4, %xmm0 991; SSE-NEXT: paddb %xmm9, %xmm0 992; SSE-NEXT: psubb %xmm4, %xmm0 993; SSE-NEXT: movdqa %xmm1, %xmm9 994; SSE-NEXT: pand %xmm5, %xmm9 995; SSE-NEXT: pxor %xmm5, %xmm1 996; SSE-NEXT: psrlw $1, %xmm1 997; SSE-NEXT: pand %xmm8, %xmm1 998; SSE-NEXT: pxor %xmm4, %xmm1 999; SSE-NEXT: paddb %xmm9, %xmm1 1000; SSE-NEXT: psubb %xmm4, %xmm1 1001; SSE-NEXT: movdqa %xmm2, %xmm5 1002; SSE-NEXT: pand %xmm6, %xmm5 1003; SSE-NEXT: pxor %xmm6, %xmm2 1004; SSE-NEXT: psrlw $1, %xmm2 1005; SSE-NEXT: pand %xmm8, %xmm2 1006; SSE-NEXT: pxor %xmm4, %xmm2 1007; SSE-NEXT: paddb %xmm5, %xmm2 
1008; SSE-NEXT: psubb %xmm4, %xmm2 1009; SSE-NEXT: movdqa %xmm3, %xmm5 1010; SSE-NEXT: pand %xmm7, %xmm5 1011; SSE-NEXT: pxor %xmm7, %xmm3 1012; SSE-NEXT: psrlw $1, %xmm3 1013; SSE-NEXT: pand %xmm8, %xmm3 1014; SSE-NEXT: pxor %xmm4, %xmm3 1015; SSE-NEXT: paddb %xmm5, %xmm3 1016; SSE-NEXT: psubb %xmm4, %xmm3 1017; SSE-NEXT: retq 1018; 1019; AVX1-LABEL: test_ext_v64i8: 1020; AVX1: # %bb.0: 1021; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm4 1022; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 1023; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 1024; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1025; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 1026; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 1027; AVX1-NEXT: vpxor %xmm7, %xmm5, %xmm5 1028; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 1029; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1030; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 1031; AVX1-NEXT: vpsubb %xmm7, %xmm2, %xmm2 1032; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4 1033; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 1034; AVX1-NEXT: vpxor %xmm7, %xmm4, %xmm4 1035; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0 1036; AVX1-NEXT: vpsubb %xmm7, %xmm0, %xmm0 1037; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1038; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm2 1039; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1040; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4 1041; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 1042; AVX1-NEXT: vpxor %xmm7, %xmm4, %xmm4 1043; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 1044; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1045; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 1046; AVX1-NEXT: vpsubb %xmm7, %xmm3, %xmm3 1047; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 1048; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 1049; AVX1-NEXT: vpxor %xmm7, %xmm2, %xmm2 1050; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 1051; AVX1-NEXT: vpsubb %xmm7, %xmm1, %xmm1 1052; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1053; AVX1-NEXT: retq 1054; 1055; AVX2-LABEL: test_ext_v64i8: 1056; AVX2: # %bb.0: 1057; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm4 1058; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 1059; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 1060; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 1061; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1062; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm5 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 1063; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm0 1064; AVX2-NEXT: vpaddb %ymm4, %ymm0, %ymm0 1065; AVX2-NEXT: vpsubb %ymm5, %ymm0, %ymm0 1066; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm4 1067; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 1068; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 1069; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1070; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1 1071; AVX2-NEXT: vpaddb %ymm4, %ymm1, %ymm1 1072; AVX2-NEXT: vpsubb %ymm5, %ymm1, %ymm1 1073; AVX2-NEXT: retq 1074; 1075; AVX512-LABEL: test_ext_v64i8: 1076; AVX512: # %bb.0: 1077; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2 1078; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 1079; AVX512-NEXT: vpsrlw $1, %zmm0, %zmm0 1080; AVX512-NEXT: vpbroadcastb {{.*#+}} zmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] 1081; AVX512-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 1082; AVX512-NEXT: vpaddb %zmm2, %zmm0, %zmm0 1083; AVX512-NEXT: 
vpsubb %zmm1, %zmm0, %zmm0 1084; AVX512-NEXT: retq 1085 %x0 = sext <64 x i8> %a0 to <64 x i16> 1086 %x1 = sext <64 x i8> %a1 to <64 x i16> 1087 %sum = add <64 x i16> %x0, %x1 1088 %shift = ashr <64 x i16> %sum, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1089 %res = trunc <64 x i16> %shift to <64 x i8> 1090 ret <64 x i8> %res 1091} 1092 1093define <32 x i16> @test_fixed_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind { 1094; SSE-LABEL: test_fixed_v32i16: 1095; SSE: # %bb.0: 1096; SSE-NEXT: movdqa %xmm0, %xmm8 1097; SSE-NEXT: pand %xmm4, %xmm8 1098; SSE-NEXT: pxor %xmm4, %xmm0 1099; SSE-NEXT: psraw $1, %xmm0 1100; SSE-NEXT: paddw %xmm8, %xmm0 1101; SSE-NEXT: movdqa %xmm1, %xmm4 1102; SSE-NEXT: pand %xmm5, %xmm4 1103; SSE-NEXT: pxor %xmm5, %xmm1 1104; SSE-NEXT: psraw $1, %xmm1 1105; SSE-NEXT: paddw %xmm4, %xmm1 1106; SSE-NEXT: movdqa %xmm2, %xmm4 1107; SSE-NEXT: pand %xmm6, %xmm4 1108; SSE-NEXT: pxor %xmm6, %xmm2 1109; SSE-NEXT: psraw $1, %xmm2 1110; SSE-NEXT: paddw %xmm4, %xmm2 1111; SSE-NEXT: movdqa %xmm3, %xmm4 1112; SSE-NEXT: pand %xmm7, %xmm4 1113; SSE-NEXT: pxor %xmm7, %xmm3 1114; SSE-NEXT: psraw $1, %xmm3 1115; SSE-NEXT: paddw %xmm4, %xmm3 1116; SSE-NEXT: retq 1117; 1118; AVX1-LABEL: test_fixed_v32i16: 1119; AVX1: # %bb.0: 1120; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm4 1121; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 1122; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 1123; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1124; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2 1125; AVX1-NEXT: vpaddw %xmm2, %xmm5, %xmm2 1126; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0 1127; AVX1-NEXT: vpaddw %xmm0, %xmm4, %xmm0 1128; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1129; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm2 1130; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1131; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 1132; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1133; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3 1134; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3 1135; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1 1136; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1 1137; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1138; AVX1-NEXT: retq 1139; 1140; AVX2-LABEL: test_fixed_v32i16: 1141; AVX2: # %bb.0: 1142; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm4 1143; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 1144; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0 1145; AVX2-NEXT: vpaddw %ymm0, %ymm4, %ymm0 1146; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 1147; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 1148; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1 1149; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 1150; AVX2-NEXT: retq 1151; 1152; AVX512-LABEL: test_fixed_v32i16: 1153; AVX512: # %bb.0: 1154; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2 1155; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 1156; AVX512-NEXT: vpsraw $1, %zmm0, %zmm0 1157; AVX512-NEXT: vpaddw %zmm0, %zmm2, %zmm0 1158; AVX512-NEXT: retq 1159 %and = and <32 x i16> %a0, %a1 1160 %xor = xor <32 x i16> %a1, %a0 1161 %shift = ashr <32 x i16> %xor, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1162 %res = add <32 x i16> %and, 
%shift 1163 ret <32 x i16> %res 1164} 1165 1166define <32 x i16> @test_ext_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind { 1167; SSE-LABEL: test_ext_v32i16: 1168; SSE: # %bb.0: 1169; SSE-NEXT: movdqa %xmm0, %xmm8 1170; SSE-NEXT: pand %xmm4, %xmm8 1171; SSE-NEXT: pxor %xmm4, %xmm0 1172; SSE-NEXT: psraw $1, %xmm0 1173; SSE-NEXT: paddw %xmm8, %xmm0 1174; SSE-NEXT: movdqa %xmm1, %xmm4 1175; SSE-NEXT: pand %xmm5, %xmm4 1176; SSE-NEXT: pxor %xmm5, %xmm1 1177; SSE-NEXT: psraw $1, %xmm1 1178; SSE-NEXT: paddw %xmm4, %xmm1 1179; SSE-NEXT: movdqa %xmm2, %xmm4 1180; SSE-NEXT: pand %xmm6, %xmm4 1181; SSE-NEXT: pxor %xmm6, %xmm2 1182; SSE-NEXT: psraw $1, %xmm2 1183; SSE-NEXT: paddw %xmm4, %xmm2 1184; SSE-NEXT: movdqa %xmm3, %xmm4 1185; SSE-NEXT: pand %xmm7, %xmm4 1186; SSE-NEXT: pxor %xmm7, %xmm3 1187; SSE-NEXT: psraw $1, %xmm3 1188; SSE-NEXT: paddw %xmm4, %xmm3 1189; SSE-NEXT: retq 1190; 1191; AVX1-LABEL: test_ext_v32i16: 1192; AVX1: # %bb.0: 1193; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm4 1194; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 1195; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 1196; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1197; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2 1198; AVX1-NEXT: vpaddw %xmm2, %xmm5, %xmm2 1199; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0 1200; AVX1-NEXT: vpaddw %xmm0, %xmm4, %xmm0 1201; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1202; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm2 1203; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1204; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 1205; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1206; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3 1207; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3 1208; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1 1209; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1 1210; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1211; AVX1-NEXT: retq 1212; 1213; AVX2-LABEL: test_ext_v32i16: 1214; AVX2: # %bb.0: 1215; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm4 1216; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 1217; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0 1218; AVX2-NEXT: vpaddw %ymm0, %ymm4, %ymm0 1219; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 1220; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 1221; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1 1222; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 1223; AVX2-NEXT: retq 1224; 1225; AVX512-LABEL: test_ext_v32i16: 1226; AVX512: # %bb.0: 1227; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2 1228; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 1229; AVX512-NEXT: vpsraw $1, %zmm0, %zmm0 1230; AVX512-NEXT: vpaddw %zmm0, %zmm2, %zmm0 1231; AVX512-NEXT: retq 1232 %x0 = sext <32 x i16> %a0 to <32 x i32> 1233 %x1 = sext <32 x i16> %a1 to <32 x i32> 1234 %sum = add <32 x i32> %x0, %x1 1235 %shift = ashr <32 x i32> %sum, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1236 %res = trunc <32 x i32> %shift to <32 x i16> 1237 ret <32 x i16> %res 1238} 1239 1240define <16 x i32> @test_fixed_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind { 1241; SSE-LABEL: test_fixed_v16i32: 1242; SSE: # %bb.0: 1243; SSE-NEXT: movdqa %xmm0, %xmm8 1244; SSE-NEXT: pand %xmm4, %xmm8 1245; SSE-NEXT: pxor %xmm4, %xmm0 1246; SSE-NEXT: psrad $1, %xmm0 1247; SSE-NEXT: paddd %xmm8, %xmm0 1248; SSE-NEXT: movdqa %xmm1, %xmm4 1249; SSE-NEXT: pand %xmm5, %xmm4 1250; SSE-NEXT: pxor %xmm5, %xmm1 1251; SSE-NEXT: psrad $1, %xmm1 1252; SSE-NEXT: paddd %xmm4, %xmm1 1253; SSE-NEXT: movdqa %xmm2, %xmm4 1254; SSE-NEXT: pand %xmm6, %xmm4 1255; SSE-NEXT: pxor %xmm6, %xmm2 1256; SSE-NEXT: 
psrad $1, %xmm2 1257; SSE-NEXT: paddd %xmm4, %xmm2 1258; SSE-NEXT: movdqa %xmm3, %xmm4 1259; SSE-NEXT: pand %xmm7, %xmm4 1260; SSE-NEXT: pxor %xmm7, %xmm3 1261; SSE-NEXT: psrad $1, %xmm3 1262; SSE-NEXT: paddd %xmm4, %xmm3 1263; SSE-NEXT: retq 1264; 1265; AVX1-LABEL: test_fixed_v16i32: 1266; AVX1: # %bb.0: 1267; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm4 1268; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 1269; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 1270; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1271; AVX1-NEXT: vpsrad $1, %xmm2, %xmm2 1272; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 1273; AVX1-NEXT: vpsrad $1, %xmm0, %xmm0 1274; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0 1275; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1276; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm2 1277; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1278; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 1279; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1280; AVX1-NEXT: vpsrad $1, %xmm3, %xmm3 1281; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 1282; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1 1283; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 1284; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1285; AVX1-NEXT: retq 1286; 1287; AVX2-LABEL: test_fixed_v16i32: 1288; AVX2: # %bb.0: 1289; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm4 1290; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 1291; AVX2-NEXT: vpsrad $1, %ymm0, %ymm0 1292; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0 1293; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 1294; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 1295; AVX2-NEXT: vpsrad $1, %ymm1, %ymm1 1296; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 1297; AVX2-NEXT: retq 1298; 1299; AVX512-LABEL: test_fixed_v16i32: 1300; AVX512: # %bb.0: 1301; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm2 1302; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 1303; AVX512-NEXT: vpsrad $1, %zmm0, %zmm0 1304; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 1305; AVX512-NEXT: retq 1306 %and = and <16 x i32> %a0, %a1 1307 %xor = xor <16 x i32> %a1, %a0 1308 %shift = ashr <16 x i32> %xor, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1309 %res = add <16 x i32> %and, %shift 1310 ret <16 x i32> %res 1311} 1312 1313define <16 x i32> @test_ext_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind { 1314; SSE-LABEL: test_ext_v16i32: 1315; SSE: # %bb.0: 1316; SSE-NEXT: movdqa %xmm0, %xmm8 1317; SSE-NEXT: pand %xmm4, %xmm8 1318; SSE-NEXT: pxor %xmm4, %xmm0 1319; SSE-NEXT: psrad $1, %xmm0 1320; SSE-NEXT: paddd %xmm8, %xmm0 1321; SSE-NEXT: movdqa %xmm1, %xmm4 1322; SSE-NEXT: pand %xmm5, %xmm4 1323; SSE-NEXT: pxor %xmm5, %xmm1 1324; SSE-NEXT: psrad $1, %xmm1 1325; SSE-NEXT: paddd %xmm4, %xmm1 1326; SSE-NEXT: movdqa %xmm2, %xmm4 1327; SSE-NEXT: pand %xmm6, %xmm4 1328; SSE-NEXT: pxor %xmm6, %xmm2 1329; SSE-NEXT: psrad $1, %xmm2 1330; SSE-NEXT: paddd %xmm4, %xmm2 1331; SSE-NEXT: movdqa %xmm3, %xmm4 1332; SSE-NEXT: pand %xmm7, %xmm4 1333; SSE-NEXT: pxor %xmm7, %xmm3 1334; SSE-NEXT: psrad $1, %xmm3 1335; SSE-NEXT: paddd %xmm4, %xmm3 1336; SSE-NEXT: retq 1337; 1338; AVX1-LABEL: test_ext_v16i32: 1339; AVX1: # %bb.0: 1340; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm4 1341; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 1342; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 1343; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1344; AVX1-NEXT: vpsrad $1, %xmm2, %xmm2 1345; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 1346; AVX1-NEXT: vpsrad $1, %xmm0, %xmm0 1347; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0 1348; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1349; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm2 1350; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1351; AVX1-NEXT: 
vxorps %ymm3, %ymm1, %ymm1 1352; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1353; AVX1-NEXT: vpsrad $1, %xmm3, %xmm3 1354; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 1355; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1 1356; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 1357; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1358; AVX1-NEXT: retq 1359; 1360; AVX2-LABEL: test_ext_v16i32: 1361; AVX2: # %bb.0: 1362; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm4 1363; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 1364; AVX2-NEXT: vpsrad $1, %ymm0, %ymm0 1365; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0 1366; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm2 1367; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 1368; AVX2-NEXT: vpsrad $1, %ymm1, %ymm1 1369; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 1370; AVX2-NEXT: retq 1371; 1372; AVX512-LABEL: test_ext_v16i32: 1373; AVX512: # %bb.0: 1374; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm2 1375; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 1376; AVX512-NEXT: vpsrad $1, %zmm0, %zmm0 1377; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0 1378; AVX512-NEXT: retq 1379 %x0 = sext <16 x i32> %a0 to <16 x i64> 1380 %x1 = sext <16 x i32> %a1 to <16 x i64> 1381 %sum = add <16 x i64> %x0, %x1 1382 %shift = ashr <16 x i64> %sum, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 1383 %res = trunc <16 x i64> %shift to <16 x i32> 1384 ret <16 x i32> %res 1385} 1386 1387define <8 x i64> @test_fixed_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { 1388; SSE2-LABEL: test_fixed_v8i64: 1389; SSE2: # %bb.0: 1390; SSE2-NEXT: movdqa %xmm0, %xmm8 1391; SSE2-NEXT: pxor %xmm4, %xmm8 1392; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,3,2,3] 1393; SSE2-NEXT: psrad $1, %xmm9 1394; SSE2-NEXT: psrlq $1, %xmm8 1395; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 1396; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] 1397; SSE2-NEXT: pand %xmm4, %xmm0 1398; SSE2-NEXT: paddq %xmm8, %xmm0 1399; SSE2-NEXT: movdqa %xmm1, %xmm4 1400; SSE2-NEXT: pxor %xmm5, %xmm4 1401; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,3,2,3] 1402; SSE2-NEXT: psrad $1, %xmm8 1403; SSE2-NEXT: psrlq $1, %xmm4 1404; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1405; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] 1406; SSE2-NEXT: pand %xmm5, %xmm1 1407; SSE2-NEXT: paddq %xmm4, %xmm1 1408; SSE2-NEXT: movdqa %xmm2, %xmm4 1409; SSE2-NEXT: pxor %xmm6, %xmm4 1410; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] 1411; SSE2-NEXT: psrad $1, %xmm5 1412; SSE2-NEXT: psrlq $1, %xmm4 1413; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1414; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1415; SSE2-NEXT: pand %xmm6, %xmm2 1416; SSE2-NEXT: paddq %xmm4, %xmm2 1417; SSE2-NEXT: movdqa %xmm3, %xmm4 1418; SSE2-NEXT: pxor %xmm7, %xmm4 1419; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3] 1420; SSE2-NEXT: psrad $1, %xmm5 1421; SSE2-NEXT: psrlq $1, %xmm4 1422; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1423; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1424; SSE2-NEXT: pand %xmm7, %xmm3 1425; SSE2-NEXT: paddq %xmm4, %xmm3 1426; SSE2-NEXT: retq 1427; 1428; SSE4-LABEL: test_fixed_v8i64: 1429; SSE4: # %bb.0: 1430; SSE4-NEXT: movdqa %xmm0, %xmm8 1431; SSE4-NEXT: pxor %xmm4, %xmm8 1432; SSE4-NEXT: movdqa %xmm8, %xmm9 1433; SSE4-NEXT: psrad $1, %xmm9 1434; SSE4-NEXT: psrlq $1, %xmm8 1435; SSE4-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7] 1436; SSE4-NEXT: pand %xmm4, %xmm0 1437; SSE4-NEXT: paddq %xmm8, %xmm0 1438; SSE4-NEXT: movdqa %xmm1, %xmm4 1439; SSE4-NEXT: pxor 
%xmm5, %xmm4 1440; SSE4-NEXT: movdqa %xmm4, %xmm8 1441; SSE4-NEXT: psrad $1, %xmm8 1442; SSE4-NEXT: psrlq $1, %xmm4 1443; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3],xmm4[4,5],xmm8[6,7] 1444; SSE4-NEXT: pand %xmm5, %xmm1 1445; SSE4-NEXT: paddq %xmm4, %xmm1 1446; SSE4-NEXT: movdqa %xmm2, %xmm4 1447; SSE4-NEXT: pxor %xmm6, %xmm4 1448; SSE4-NEXT: movdqa %xmm4, %xmm5 1449; SSE4-NEXT: psrad $1, %xmm5 1450; SSE4-NEXT: psrlq $1, %xmm4 1451; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 1452; SSE4-NEXT: pand %xmm6, %xmm2 1453; SSE4-NEXT: paddq %xmm4, %xmm2 1454; SSE4-NEXT: movdqa %xmm3, %xmm4 1455; SSE4-NEXT: pxor %xmm7, %xmm4 1456; SSE4-NEXT: movdqa %xmm4, %xmm5 1457; SSE4-NEXT: psrad $1, %xmm5 1458; SSE4-NEXT: psrlq $1, %xmm4 1459; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 1460; SSE4-NEXT: pand %xmm7, %xmm3 1461; SSE4-NEXT: paddq %xmm4, %xmm3 1462; SSE4-NEXT: retq 1463; 1464; AVX1-LABEL: test_fixed_v8i64: 1465; AVX1: # %bb.0: 1466; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm4 1467; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 1468; AVX1-NEXT: vpsrad $1, %xmm5, %xmm6 1469; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm5 1470; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 1471; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 1472; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1473; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 1474; AVX1-NEXT: vpsrad $1, %xmm4, %xmm5 1475; AVX1-NEXT: vpsrlq $1, %xmm4, %xmm4 1476; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 1477; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 1478; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1479; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm2 1480; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 1481; AVX1-NEXT: vpsrad $1, %xmm4, %xmm5 1482; AVX1-NEXT: vpsrlq $1, %xmm4, %xmm4 1483; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 1484; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 1485; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1486; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 1487; AVX1-NEXT: vpsrad $1, %xmm2, %xmm4 1488; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm2 1489; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1490; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 1491; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 1492; AVX1-NEXT: retq 1493; 1494; AVX2-LABEL: test_fixed_v8i64: 1495; AVX2: # %bb.0: 1496; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4 1497; AVX2-NEXT: vpsrad $1, %ymm4, %ymm5 1498; AVX2-NEXT: vpsrlq $1, %ymm4, %ymm4 1499; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] 1500; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1501; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0 1502; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm2 1503; AVX2-NEXT: vpsrad $1, %ymm2, %ymm4 1504; AVX2-NEXT: vpsrlq $1, %ymm2, %ymm2 1505; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7] 1506; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 1507; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 1508; AVX2-NEXT: retq 1509; 1510; AVX512-LABEL: test_fixed_v8i64: 1511; AVX512: # %bb.0: 1512; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2 1513; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 1514; AVX512-NEXT: vpsraq $1, %zmm0, %zmm0 1515; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0 1516; AVX512-NEXT: retq 1517 %and = and <8 x i64> %a0, %a1 1518 %xor = xor <8 x i64> %a1, %a0 1519 %shift = ashr <8 x i64> %xor, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> 1520 %res = add <8 x i64> %and, %shift 1521 ret <8 x i64> %res 1522} 
define <8 x i64> @test_ext_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_ext_v8i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm8
; SSE2-NEXT: pxor %xmm4, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[1,3,2,3]
; SSE2-NEXT: psrad $1, %xmm9
; SSE2-NEXT: psrlq $1, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm8, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,3,2,3]
; SSE2-NEXT: psrad $1, %xmm8
; SSE2-NEXT: psrlq $1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: paddq %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
; SSE2-NEXT: psrad $1, %xmm5
; SSE2-NEXT: psrlq $1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT: pand %xmm6, %xmm2
; SSE2-NEXT: paddq %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,3,2,3]
; SSE2-NEXT: psrad $1, %xmm5
; SSE2-NEXT: psrlq $1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: paddq %xmm4, %xmm3
; SSE2-NEXT: retq
;
; SSE4-LABEL: test_ext_v8i64:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: pxor %xmm4, %xmm8
; SSE4-NEXT: movdqa %xmm8, %xmm9
; SSE4-NEXT: psrad $1, %xmm9
; SSE4-NEXT: psrlq $1, %xmm8
; SSE4-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7]
; SSE4-NEXT: pand %xmm4, %xmm0
; SSE4-NEXT: paddq %xmm8, %xmm0
; SSE4-NEXT: movdqa %xmm1, %xmm4
; SSE4-NEXT: pxor %xmm5, %xmm4
; SSE4-NEXT: movdqa %xmm4, %xmm8
; SSE4-NEXT: psrad $1, %xmm8
; SSE4-NEXT: psrlq $1, %xmm4
; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3],xmm4[4,5],xmm8[6,7]
; SSE4-NEXT: pand %xmm5, %xmm1
; SSE4-NEXT: paddq %xmm4, %xmm1
; SSE4-NEXT: movdqa %xmm2, %xmm4
; SSE4-NEXT: pxor %xmm6, %xmm4
; SSE4-NEXT: movdqa %xmm4, %xmm5
; SSE4-NEXT: psrad $1, %xmm5
; SSE4-NEXT: psrlq $1, %xmm4
; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; SSE4-NEXT: pand %xmm6, %xmm2
; SSE4-NEXT: paddq %xmm4, %xmm2
; SSE4-NEXT: movdqa %xmm3, %xmm4
; SSE4-NEXT: pxor %xmm7, %xmm4
; SSE4-NEXT: movdqa %xmm4, %xmm5
; SSE4-NEXT: psrad $1, %xmm5
; SSE4-NEXT: psrlq $1, %xmm4
; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; SSE4-NEXT: pand %xmm7, %xmm3
; SSE4-NEXT: paddq %xmm4, %xmm3
; SSE4-NEXT: retq
;
; AVX1-LABEL: test_ext_v8i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; AVX1-NEXT: vpsrad $1, %xmm5, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsrad $1, %xmm4, %xmm5
; AVX1-NEXT: vpsrlq $1, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpsrad $1, %xmm4, %xmm5
; AVX1-NEXT: vpsrlq $1, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsrad $1, %xmm2, %xmm4
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_ext_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
; AVX2-NEXT: vpsrad $1, %ymm4, %ymm5
; AVX2-NEXT: vpsrlq $1, %ymm4, %ymm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm2
; AVX2-NEXT: vpsrad $1, %ymm2, %ymm4
; AVX2-NEXT: vpsrlq $1, %ymm2, %ymm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3],ymm2[4],ymm4[5],ymm2[6],ymm4[7]
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_ext_v8i64:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpsraq $1, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; AVX512-NEXT: retq
  %x0 = sext <8 x i64> %a0 to <8 x i128>
  %x1 = sext <8 x i64> %a1 to <8 x i128>
  %sum = add <8 x i128> %x0, %x1
  %shift = ashr <8 x i128> %sum, <i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1, i128 1>
  %res = trunc <8 x i128> %shift to <8 x i64>
  ret <8 x i64> %res
}