; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=ANY,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=ANY,INT256
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=ANY,INT256

define <4 x double> @andpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: andpd256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandpd %ymm0, %ymm1, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %x to <4 x i64>
  %1 = bitcast <4 x double> %y to <4 x i64>
  %and.i = and <4 x i64> %0, %1
  %2 = bitcast <4 x i64> %and.i to <4 x double>
  ; add forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x double> @andpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: andpd256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %y to <4 x i64>
  %and.i = and <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
  %1 = bitcast <4 x i64> %and.i to <4 x double>
  ; add forces execution domain
  %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %2
}

define <8 x float> @andps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: andps256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandps %ymm0, %ymm1, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %x to <8 x i32>
  %1 = bitcast <8 x float> %y to <8 x i32>
  %and.i = and <8 x i32> %0, %1
  %2 = bitcast <8 x i32> %and.i to <8 x float>
  ret <8 x float> %2
}

define <8 x float> @andps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: andps256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %y to <8 x i32>
  %and.i = and <8 x i32> %0, <i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938, i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938>
  %1 = bitcast <8 x i32> %and.i to <8 x float>
  ret <8 x float> %1
}

define <4 x double> @xorpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: xorpd256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vxorpd %ymm0, %ymm1, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %x to <4 x i64>
  %1 = bitcast <4 x double> %y to <4 x i64>
  %xor.i = xor <4 x i64> %0, %1
  %2 = bitcast <4 x i64> %xor.i to <4 x double>
  ; add forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x double> @xorpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: xorpd256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %y to <4 x i64>
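  ; Note: the i64 constants below are the bit patterns of the doubles <4.5, 3.4, 2.3, 1.2>
  ; (the same vector used by the other *pd256fold tests), so the logic op still folds a
  ; constant-pool load, as the LCPI operand in the checks shows.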
  %xor.i = xor <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
  %1 = bitcast <4 x i64> %xor.i to <4 x double>
  ; add forces execution domain
  %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %2
}

define <8 x float> @xorps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: xorps256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vxorps %ymm0, %ymm1, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %x to <8 x i32>
  %1 = bitcast <8 x float> %y to <8 x i32>
  %xor.i = xor <8 x i32> %0, %1
  %2 = bitcast <8 x i32> %xor.i to <8 x float>
  ret <8 x float> %2
}

define <8 x float> @xorps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: xorps256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %y to <8 x i32>
  %xor.i = xor <8 x i32> %0, <i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938, i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938>
  %1 = bitcast <8 x i32> %xor.i to <8 x float>
  ret <8 x float> %1
}

define <4 x double> @orpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: orpd256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vorpd %ymm0, %ymm1, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %x to <4 x i64>
  %1 = bitcast <4 x double> %y to <4 x i64>
  %or.i = or <4 x i64> %0, %1
  %2 = bitcast <4 x i64> %or.i to <4 x double>
  ; add forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x double> @orpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: orpd256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %y to <4 x i64>
  %or.i = or <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507>
  %1 = bitcast <4 x i64> %or.i to <4 x double>
  ; add forces execution domain
  %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %2
}

define <8 x float> @orps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: orps256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vorps %ymm0, %ymm1, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %x to <8 x i32>
  %1 = bitcast <8 x float> %y to <8 x i32>
  %or.i = or <8 x i32> %0, %1
  %2 = bitcast <8 x i32> %or.i to <8 x float>
  ret <8 x float> %2
}

define <8 x float> @orps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; ANY-LABEL: orps256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %y to <8 x i32>
  %or.i = or <8 x i32> %0, <i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938, i32 1083179008, i32 1079613850, i32 1075000115, i32 1067030938>
  %1 = bitcast <8 x i32> %or.i to <8 x float>
  ret <8 x float> %1
}

define <4 x double> @andnotpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: andnotpd256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandnpd %ymm0, %ymm1, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <4 x double> %x to <4 x i64>
  %neg.i = xor <4 x i64> %0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %1 = bitcast <4 x double> %y to <4 x i64>
  %and.i = and <4 x i64> %1, %neg.i
  %2 = bitcast <4 x i64> %and.i to <4 x double>
  ; add forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <4 x double> @andnotpd256fold(<4 x double> %y, ptr nocapture %x) nounwind uwtable readonly ssp {
; ANY-LABEL: andnotpd256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandnpd (%rdi), %ymm0, %ymm0
; ANY-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %tmp2 = load <4 x double>, ptr %x, align 32
  %0 = bitcast <4 x double> %y to <4 x i64>
  %neg.i = xor <4 x i64> %0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %1 = bitcast <4 x double> %tmp2 to <4 x i64>
  %and.i = and <4 x i64> %1, %neg.i
  %2 = bitcast <4 x i64> %and.i to <4 x double>
  ; add forces execution domain
  %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
  ret <4 x double> %3
}

define <8 x float> @andnotps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; ANY-LABEL: andnotps256:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandnps %ymm0, %ymm1, %ymm0
; ANY-NEXT:    retq
entry:
  %0 = bitcast <8 x float> %x to <8 x i32>
  %neg.i = xor <8 x i32> %0, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %1 = bitcast <8 x float> %y to <8 x i32>
  %and.i = and <8 x i32> %1, %neg.i
  %2 = bitcast <8 x i32> %and.i to <8 x float>
  ret <8 x float> %2
}

define <8 x float> @andnotps256fold(<8 x float> %y, ptr nocapture %x) nounwind uwtable readonly ssp {
; ANY-LABEL: andnotps256fold:
; ANY:       # %bb.0: # %entry
; ANY-NEXT:    vandnps (%rdi), %ymm0, %ymm0
; ANY-NEXT:    retq
entry:
  %tmp2 = load <8 x float>, ptr %x, align 32
  %0 = bitcast <8 x float> %y to <8 x i32>
  %neg.i = xor <8 x i32> %0, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %1 = bitcast <8 x float> %tmp2 to <8 x i32>
  %and.i = and <8 x i32> %1, %neg.i
  %2 = bitcast <8 x i32> %and.i to <8 x float>
  ret <8 x float> %2
}

;;; Test that basic 2 x i64 logic ops use the integer version on AVX

define <2 x i64> @vpandn(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
; ANY-LABEL: vpandn:
; ANY:       # %bb.0:
; ANY-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; ANY-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
; ANY-NEXT:    vpandn %xmm0, %xmm1, %xmm0
; ANY-NEXT:    retq
  %a2 = add <2 x i64> %a, <i64 1, i64 1>
  %y = xor <2 x i64> %a2, <i64 -1, i64 -1>
  %x = and <2 x i64> %a, %y
  ret <2 x i64> %x
}

define <2 x i64> @vpand(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
  ; Force the execution domain with an add.
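  ; (In the checks, the add of 1 shows up as vpcmpeqd producing all-ones followed by
  ; vpsubq, i.e. x - (-1) == x + 1, which keeps the value in the integer domain.)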
; ANY-LABEL: vpand:
; ANY:       # %bb.0:
; ANY-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; ANY-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; ANY-NEXT:    vpand %xmm1, %xmm0, %xmm0
; ANY-NEXT:    retq
  %a2 = add <2 x i64> %a, <i64 1, i64 1>
  %x = and <2 x i64> %a2, %b
  ret <2 x i64> %x
}

define <4 x i32> @and_xor_splat1_v4i32(<4 x i32> %x) nounwind {
; AVX1-LABEL: and_xor_splat1_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: and_xor_splat1_v4i32:
; INT256:       # %bb.0:
; INT256-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; INT256-NEXT:    vandnps %xmm1, %xmm0, %xmm0
; INT256-NEXT:    retq
  %xor = xor <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  %and = and <4 x i32> %xor, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %and
}

define <4 x i64> @and_xor_splat1_v4i64(<4 x i64> %x) nounwind {
; AVX1-LABEL: and_xor_splat1_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: and_xor_splat1_v4i64:
; INT256:       # %bb.0:
; INT256-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
; INT256-NEXT:    vandnps %ymm1, %ymm0, %ymm0
; INT256-NEXT:    retq
  %xor = xor <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1>
  %and = and <4 x i64> %xor, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i64> %and
}

; PR37749 - https://bugs.llvm.org/show_bug.cgi?id=37749
; For AVX1, we don't want a 256-bit logic op with insert/extract to the surrounding 128-bit ops.

define <8 x i32> @and_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: and_disguised_i8_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: and_disguised_i8_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = and <8 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}

define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: andn_disguised_i8_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: andn_disguised_i8_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
; INT256-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %add = add <8 x i32> %y, %x
  %neg = and <8 x i32> %add, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %and = xor <8 x i32> %neg, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %add1 = add <8 x i32> %and, %z
  ret <8 x i32> %add1
}

; Negative test - if we don't have a leading concat_vectors, the transform won't be profitable.

define <8 x i32> @andn_variable_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: andn_variable_mask_operand_no_concat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandnps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: andn_variable_mask_operand_no_concat:
; INT256:       # %bb.0:
; INT256-NEXT:    vpandn %ymm2, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    retq
  %and = and <8 x i32> %x, %z
  %xor = xor <8 x i32> %and, %z ; demanded bits will make this a 'not'
  %add = add <8 x i32> %xor, %y
  ret <8 x i32> %add
}

; Negative test - if we don't have a leading concat_vectors, the transform won't be profitable (even if the mask is a constant).

define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %y) {
; AVX1-LABEL: andn_constant_mask_operand_no_concat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: andn_constant_mask_operand_no_concat:
; INT256:       # %bb.0:
; INT256-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    retq
  %xor = xor <8 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %and = and <8 x i32> %xor, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %r = add <8 x i32> %and, %y
  ret <8 x i32> %r
}

; This is a close call, but we split the 'andn' to reduce the insert/extract.
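; (With a leading concat_vectors from the 256-bit add, the AVX1 checks below show the 'andn'
; done as two 128-bit vpandn ops, so the add/andn/add chain stays in 128-bit integer ops with
; a single vinsertf128 at the end, rather than one 256-bit vandnps plus extra extract/insert.)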

define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
; AVX1-LABEL: andn_variable_mask_operand_concat:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT:    vpandn %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm2, %xmm4, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: andn_variable_mask_operand_concat:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpandn %ymm2, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm3, %ymm0, %ymm0
; INT256-NEXT:    retq
  %add = add <8 x i32> %x, %y
  %xor = xor <8 x i32> %add, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %and = and <8 x i32> %xor, %z
  %r = add <8 x i32> %and, %w
  ret <8 x i32> %r
}

define <8 x i32> @or_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: or_disguised_i8_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: or_disguised_i8_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; INT256-NEXT:    vpor %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = or <8 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}

define <8 x i32> @xor_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: xor_disguised_i8_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [255,255,255,255]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: xor_disguised_i8_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; INT256-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = xor <8 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}

define <8 x i32> @and_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: and_disguised_i16_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: and_disguised_i16_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; INT256-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = and <8 x i32> %a, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}

define <8 x i32> @or_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: or_disguised_i16_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [65535,65535,65535,65535]
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: or_disguised_i16_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; INT256-NEXT:    vpor %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = or <8 x i32> %a, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}

define <8 x i32> @xor_disguised_i16_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
; AVX1-LABEL: xor_disguised_i16_elts:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [65535,65535,65535,65535]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; INT256-LABEL: xor_disguised_i16_elts:
; INT256:       # %bb.0:
; INT256-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
; INT256-NEXT:    vpxor %ymm1, %ymm0, %ymm0
; INT256-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; INT256-NEXT:    retq
  %a = add <8 x i32> %x, %y
  %l = xor <8 x i32> %a, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %t = add <8 x i32> %l, %z
  ret <8 x i32> %t
}