1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements a TargetTransformInfo analysis pass specific to the 10 /// X86 target machine. It uses the target's detailed information to provide 11 /// more precise answers to certain TTI queries, while letting the target 12 /// independent and default TTI implementations handle the rest. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #include "X86TargetTransformInfo.h" 17 #include "llvm/IR/IntrinsicInst.h" 18 #include "llvm/IR/IntrinsicsX86.h" 19 #include "llvm/Support/KnownBits.h" 20 #include "llvm/Transforms/InstCombine/InstCombiner.h" 21 #include <optional> 22 23 using namespace llvm; 24 25 #define DEBUG_TYPE "x86tti" 26 27 /// Return a constant boolean vector that has true elements in all positions 28 /// where the input constant data vector has an element with the sign bit set. 29 static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) { 30 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType())); 31 V = ConstantExpr::getBitCast(V, IntTy); 32 V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT, 33 Constant::getNullValue(IntTy), V, DL); 34 assert(V && "Vector must be foldable"); 35 return V; 36 } 37 38 /// Convert the x86 XMM integer vector mask to a vector of bools based on 39 /// each element's most significant bit (the sign bit). 40 static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) { 41 // Fold Constant Mask. 42 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) 43 return getNegativeIsTrueBoolVec(ConstantMask, DL); 44 45 // Mask was extended from a boolean vector. 46 Value *ExtMask; 47 if (PatternMatch::match( 48 Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) && 49 ExtMask->getType()->isIntOrIntVectorTy(1)) 50 return ExtMask; 51 52 return nullptr; 53 } 54 55 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 56 // XMM register mask efficiently, we could transform all x86 masked intrinsics 57 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 58 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { 59 Value *Ptr = II.getOperand(0); 60 Value *Mask = II.getOperand(1); 61 Constant *ZeroVec = Constant::getNullValue(II.getType()); 62 63 // Zero Mask - masked load instruction creates a zero vector. 64 if (isa<ConstantAggregateZero>(Mask)) 65 return IC.replaceInstUsesWith(II, ZeroVec); 66 67 // The mask is constant or extended from a bool vector. Convert this x86 68 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 69 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) { 70 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 71 // the LLVM intrinsic definition for the pointer argument. 72 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 73 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); 74 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 75 76 // The pass-through vector for an x86 masked load is a zero vector. 
77 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad( 78 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec); 79 return IC.replaceInstUsesWith(II, NewMaskedLoad); 80 } 81 82 return nullptr; 83 } 84 85 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 86 // XMM register mask efficiently, we could transform all x86 masked intrinsics 87 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 88 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { 89 Value *Ptr = II.getOperand(0); 90 Value *Mask = II.getOperand(1); 91 Value *Vec = II.getOperand(2); 92 93 // Zero Mask - this masked store instruction does nothing. 94 if (isa<ConstantAggregateZero>(Mask)) { 95 IC.eraseInstFromFunction(II); 96 return true; 97 } 98 99 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do 100 // anything else at this level. 101 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) 102 return false; 103 104 // The mask is constant or extended from a bool vector. Convert this x86 105 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 106 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) { 107 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 108 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); 109 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 110 111 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); 112 113 // 'Replace uses' doesn't work for stores. Erase the original masked store. 114 IC.eraseInstFromFunction(II); 115 return true; 116 } 117 118 return false; 119 } 120 121 static Value *simplifyX86immShift(const IntrinsicInst &II, 122 InstCombiner::BuilderTy &Builder) { 123 bool LogicalShift = false; 124 bool ShiftLeft = false; 125 bool IsImm = false; 126 127 switch (II.getIntrinsicID()) { 128 default: 129 llvm_unreachable("Unexpected intrinsic!"); 130 case Intrinsic::x86_sse2_psrai_d: 131 case Intrinsic::x86_sse2_psrai_w: 132 case Intrinsic::x86_avx2_psrai_d: 133 case Intrinsic::x86_avx2_psrai_w: 134 case Intrinsic::x86_avx512_psrai_q_128: 135 case Intrinsic::x86_avx512_psrai_q_256: 136 case Intrinsic::x86_avx512_psrai_d_512: 137 case Intrinsic::x86_avx512_psrai_q_512: 138 case Intrinsic::x86_avx512_psrai_w_512: 139 IsImm = true; 140 [[fallthrough]]; 141 case Intrinsic::x86_sse2_psra_d: 142 case Intrinsic::x86_sse2_psra_w: 143 case Intrinsic::x86_avx2_psra_d: 144 case Intrinsic::x86_avx2_psra_w: 145 case Intrinsic::x86_avx512_psra_q_128: 146 case Intrinsic::x86_avx512_psra_q_256: 147 case Intrinsic::x86_avx512_psra_d_512: 148 case Intrinsic::x86_avx512_psra_q_512: 149 case Intrinsic::x86_avx512_psra_w_512: 150 LogicalShift = false; 151 ShiftLeft = false; 152 break; 153 case Intrinsic::x86_sse2_psrli_d: 154 case Intrinsic::x86_sse2_psrli_q: 155 case Intrinsic::x86_sse2_psrli_w: 156 case Intrinsic::x86_avx2_psrli_d: 157 case Intrinsic::x86_avx2_psrli_q: 158 case Intrinsic::x86_avx2_psrli_w: 159 case Intrinsic::x86_avx512_psrli_d_512: 160 case Intrinsic::x86_avx512_psrli_q_512: 161 case Intrinsic::x86_avx512_psrli_w_512: 162 IsImm = true; 163 [[fallthrough]]; 164 case Intrinsic::x86_sse2_psrl_d: 165 case Intrinsic::x86_sse2_psrl_q: 166 case Intrinsic::x86_sse2_psrl_w: 167 case Intrinsic::x86_avx2_psrl_d: 168 case Intrinsic::x86_avx2_psrl_q: 169 case Intrinsic::x86_avx2_psrl_w: 170 case Intrinsic::x86_avx512_psrl_d_512: 171 case Intrinsic::x86_avx512_psrl_q_512: 172 case Intrinsic::x86_avx512_psrl_w_512: 
173 LogicalShift = true; 174 ShiftLeft = false; 175 break; 176 case Intrinsic::x86_sse2_pslli_d: 177 case Intrinsic::x86_sse2_pslli_q: 178 case Intrinsic::x86_sse2_pslli_w: 179 case Intrinsic::x86_avx2_pslli_d: 180 case Intrinsic::x86_avx2_pslli_q: 181 case Intrinsic::x86_avx2_pslli_w: 182 case Intrinsic::x86_avx512_pslli_d_512: 183 case Intrinsic::x86_avx512_pslli_q_512: 184 case Intrinsic::x86_avx512_pslli_w_512: 185 IsImm = true; 186 [[fallthrough]]; 187 case Intrinsic::x86_sse2_psll_d: 188 case Intrinsic::x86_sse2_psll_q: 189 case Intrinsic::x86_sse2_psll_w: 190 case Intrinsic::x86_avx2_psll_d: 191 case Intrinsic::x86_avx2_psll_q: 192 case Intrinsic::x86_avx2_psll_w: 193 case Intrinsic::x86_avx512_psll_d_512: 194 case Intrinsic::x86_avx512_psll_q_512: 195 case Intrinsic::x86_avx512_psll_w_512: 196 LogicalShift = true; 197 ShiftLeft = true; 198 break; 199 } 200 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 201 202 Value *Vec = II.getArgOperand(0); 203 Value *Amt = II.getArgOperand(1); 204 auto *VT = cast<FixedVectorType>(Vec->getType()); 205 Type *SVT = VT->getElementType(); 206 Type *AmtVT = Amt->getType(); 207 unsigned VWidth = VT->getNumElements(); 208 unsigned BitWidth = SVT->getPrimitiveSizeInBits(); 209 210 // If the shift amount is guaranteed to be in-range we can replace it with a 211 // generic shift. If its guaranteed to be out of range, logical shifts combine 212 // to zero and arithmetic shifts are clamped to (BitWidth - 1). 213 if (IsImm) { 214 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); 215 KnownBits KnownAmtBits = 216 llvm::computeKnownBits(Amt, II.getDataLayout()); 217 if (KnownAmtBits.getMaxValue().ult(BitWidth)) { 218 Amt = Builder.CreateZExtOrTrunc(Amt, SVT); 219 Amt = Builder.CreateVectorSplat(VWidth, Amt); 220 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 221 : Builder.CreateLShr(Vec, Amt)) 222 : Builder.CreateAShr(Vec, Amt)); 223 } 224 if (KnownAmtBits.getMinValue().uge(BitWidth)) { 225 if (LogicalShift) 226 return ConstantAggregateZero::get(VT); 227 Amt = ConstantInt::get(SVT, BitWidth - 1); 228 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); 229 } 230 } else { 231 // Ensure the first element has an in-range value and the rest of the 232 // elements in the bottom 64 bits are zero. 233 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 234 cast<VectorType>(AmtVT)->getElementType() == SVT && 235 "Unexpected shift-by-scalar type"); 236 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements(); 237 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); 238 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); 239 KnownBits KnownLowerBits = llvm::computeKnownBits( 240 Amt, DemandedLower, II.getDataLayout()); 241 KnownBits KnownUpperBits = llvm::computeKnownBits( 242 Amt, DemandedUpper, II.getDataLayout()); 243 if (KnownLowerBits.getMaxValue().ult(BitWidth) && 244 (DemandedUpper.isZero() || KnownUpperBits.isZero())) { 245 SmallVector<int, 16> ZeroSplat(VWidth, 0); 246 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat); 247 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 248 : Builder.CreateLShr(Vec, Amt)) 249 : Builder.CreateAShr(Vec, Amt)); 250 } 251 } 252 253 // Simplify if count is constant vector. 254 auto *CDV = dyn_cast<ConstantDataVector>(Amt); 255 if (!CDV) 256 return nullptr; 257 258 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector 259 // operand to compute the shift amount. 
260 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 261 cast<VectorType>(AmtVT)->getElementType() == SVT && 262 "Unexpected shift-by-scalar type"); 263 264 // Concatenate the sub-elements to create the 64-bit value. 265 APInt Count(64, 0); 266 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { 267 unsigned SubEltIdx = (NumSubElts - 1) - i; 268 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); 269 Count <<= BitWidth; 270 Count |= SubElt->getValue().zextOrTrunc(64); 271 } 272 273 // If shift-by-zero then just return the original value. 274 if (Count.isZero()) 275 return Vec; 276 277 // Handle cases when Shift >= BitWidth. 278 if (Count.uge(BitWidth)) { 279 // If LogicalShift - just return zero. 280 if (LogicalShift) 281 return ConstantAggregateZero::get(VT); 282 283 // If ArithmeticShift - clamp Shift to (BitWidth - 1). 284 Count = APInt(64, BitWidth - 1); 285 } 286 287 // Get a constant vector of the same type as the first operand. 288 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); 289 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); 290 291 if (ShiftLeft) 292 return Builder.CreateShl(Vec, ShiftVec); 293 294 if (LogicalShift) 295 return Builder.CreateLShr(Vec, ShiftVec); 296 297 return Builder.CreateAShr(Vec, ShiftVec); 298 } 299 300 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. 301 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out 302 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 303 static Value *simplifyX86varShift(const IntrinsicInst &II, 304 InstCombiner::BuilderTy &Builder) { 305 bool LogicalShift = false; 306 bool ShiftLeft = false; 307 308 switch (II.getIntrinsicID()) { 309 default: 310 llvm_unreachable("Unexpected intrinsic!"); 311 case Intrinsic::x86_avx2_psrav_d: 312 case Intrinsic::x86_avx2_psrav_d_256: 313 case Intrinsic::x86_avx512_psrav_q_128: 314 case Intrinsic::x86_avx512_psrav_q_256: 315 case Intrinsic::x86_avx512_psrav_d_512: 316 case Intrinsic::x86_avx512_psrav_q_512: 317 case Intrinsic::x86_avx512_psrav_w_128: 318 case Intrinsic::x86_avx512_psrav_w_256: 319 case Intrinsic::x86_avx512_psrav_w_512: 320 LogicalShift = false; 321 ShiftLeft = false; 322 break; 323 case Intrinsic::x86_avx2_psrlv_d: 324 case Intrinsic::x86_avx2_psrlv_d_256: 325 case Intrinsic::x86_avx2_psrlv_q: 326 case Intrinsic::x86_avx2_psrlv_q_256: 327 case Intrinsic::x86_avx512_psrlv_d_512: 328 case Intrinsic::x86_avx512_psrlv_q_512: 329 case Intrinsic::x86_avx512_psrlv_w_128: 330 case Intrinsic::x86_avx512_psrlv_w_256: 331 case Intrinsic::x86_avx512_psrlv_w_512: 332 LogicalShift = true; 333 ShiftLeft = false; 334 break; 335 case Intrinsic::x86_avx2_psllv_d: 336 case Intrinsic::x86_avx2_psllv_d_256: 337 case Intrinsic::x86_avx2_psllv_q: 338 case Intrinsic::x86_avx2_psllv_q_256: 339 case Intrinsic::x86_avx512_psllv_d_512: 340 case Intrinsic::x86_avx512_psllv_q_512: 341 case Intrinsic::x86_avx512_psllv_w_128: 342 case Intrinsic::x86_avx512_psllv_w_256: 343 case Intrinsic::x86_avx512_psllv_w_512: 344 LogicalShift = true; 345 ShiftLeft = true; 346 break; 347 } 348 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 349 350 Value *Vec = II.getArgOperand(0); 351 Value *Amt = II.getArgOperand(1); 352 auto *VT = cast<FixedVectorType>(II.getType()); 353 Type *SVT = VT->getElementType(); 354 int NumElts = VT->getNumElements(); 355 int BitWidth = SVT->getIntegerBitWidth(); 356 357 // If the shift 
amount is guaranteed to be in-range we can replace it with a 358 // generic shift. 359 KnownBits KnownAmt = 360 llvm::computeKnownBits(Amt, II.getDataLayout()); 361 if (KnownAmt.getMaxValue().ult(BitWidth)) { 362 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 363 : Builder.CreateLShr(Vec, Amt)) 364 : Builder.CreateAShr(Vec, Amt)); 365 } 366 367 // Simplify if all shift amounts are constant/undef. 368 auto *CShift = dyn_cast<Constant>(Amt); 369 if (!CShift) 370 return nullptr; 371 372 // Collect each element's shift amount. 373 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. 374 bool AnyOutOfRange = false; 375 SmallVector<int, 8> ShiftAmts; 376 for (int I = 0; I < NumElts; ++I) { 377 auto *CElt = CShift->getAggregateElement(I); 378 if (isa_and_nonnull<UndefValue>(CElt)) { 379 ShiftAmts.push_back(-1); 380 continue; 381 } 382 383 auto *COp = dyn_cast_or_null<ConstantInt>(CElt); 384 if (!COp) 385 return nullptr; 386 387 // Handle out of range shifts. 388 // If LogicalShift - set to BitWidth (special case). 389 // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 390 APInt ShiftVal = COp->getValue(); 391 if (ShiftVal.uge(BitWidth)) { 392 AnyOutOfRange = LogicalShift; 393 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); 394 continue; 395 } 396 397 ShiftAmts.push_back((int)ShiftVal.getZExtValue()); 398 } 399 400 // If all elements out of range or UNDEF, return vector of zeros/undefs. 401 // ArithmeticShift should only hit this if they are all UNDEF. 402 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; 403 if (llvm::all_of(ShiftAmts, OutOfRange)) { 404 SmallVector<Constant *, 8> ConstantVec; 405 for (int Idx : ShiftAmts) { 406 if (Idx < 0) { 407 ConstantVec.push_back(UndefValue::get(SVT)); 408 } else { 409 assert(LogicalShift && "Logical shift expected"); 410 ConstantVec.push_back(ConstantInt::getNullValue(SVT)); 411 } 412 } 413 return ConstantVector::get(ConstantVec); 414 } 415 416 // We can't handle only some out of range values with generic logical shifts. 417 if (AnyOutOfRange) 418 return nullptr; 419 420 // Build the shift amount constant vector. 421 SmallVector<Constant *, 8> ShiftVecAmts; 422 for (int Idx : ShiftAmts) { 423 if (Idx < 0) 424 ShiftVecAmts.push_back(UndefValue::get(SVT)); 425 else 426 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); 427 } 428 auto ShiftVec = ConstantVector::get(ShiftVecAmts); 429 430 if (ShiftLeft) 431 return Builder.CreateShl(Vec, ShiftVec); 432 433 if (LogicalShift) 434 return Builder.CreateLShr(Vec, ShiftVec); 435 436 return Builder.CreateAShr(Vec, ShiftVec); 437 } 438 439 static Value *simplifyX86pack(IntrinsicInst &II, 440 InstCombiner::BuilderTy &Builder, bool IsSigned) { 441 Value *Arg0 = II.getArgOperand(0); 442 Value *Arg1 = II.getArgOperand(1); 443 Type *ResTy = II.getType(); 444 445 // Fast all undef handling. 
446 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) 447 return UndefValue::get(ResTy); 448 449 auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 450 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; 451 unsigned NumSrcElts = ArgTy->getNumElements(); 452 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) && 453 "Unexpected packing types"); 454 455 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; 456 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); 457 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); 458 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && 459 "Unexpected packing types"); 460 461 // Constant folding. 462 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 463 return nullptr; 464 465 // Clamp Values - signed/unsigned both use signed clamp values, but they 466 // differ on the min/max values. 467 APInt MinValue, MaxValue; 468 if (IsSigned) { 469 // PACKSS: Truncate signed value with signed saturation. 470 // Source values less than dst minint are saturated to minint. 471 // Source values greater than dst maxint are saturated to maxint. 472 MinValue = 473 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 474 MaxValue = 475 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 476 } else { 477 // PACKUS: Truncate signed value with unsigned saturation. 478 // Source values less than zero are saturated to zero. 479 // Source values greater than dst maxuint are saturated to maxuint. 480 MinValue = APInt::getZero(SrcScalarSizeInBits); 481 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); 482 } 483 484 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); 485 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); 486 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); 487 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); 488 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); 489 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); 490 491 // Shuffle clamped args together at the lane level. 492 SmallVector<int, 32> PackMask; 493 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 494 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 495 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); 496 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 497 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); 498 } 499 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); 500 501 // Truncate to dst size. 502 return Builder.CreateTrunc(Shuffle, ResTy); 503 } 504 505 static Value *simplifyX86movmsk(const IntrinsicInst &II, 506 InstCombiner::BuilderTy &Builder) { 507 Value *Arg = II.getArgOperand(0); 508 Type *ResTy = II.getType(); 509 510 // movmsk(undef) -> zero as we must ensure the upper bits are zero. 511 if (isa<UndefValue>(Arg)) 512 return Constant::getNullValue(ResTy); 513 514 auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType()); 515 // We can't easily peek through x86_mmx types. 516 if (!ArgTy) 517 return nullptr; 518 519 // Expand MOVMSK to compare/bitcast/zext: 520 // e.g. 
PMOVMSKB(v16i8 x): 521 // %cmp = icmp slt <16 x i8> %x, zeroinitializer 522 // %int = bitcast <16 x i1> %cmp to i16 523 // %res = zext i16 %int to i32 524 unsigned NumElts = ArgTy->getNumElements(); 525 Type *IntegerTy = Builder.getIntNTy(NumElts); 526 527 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy)); 528 Res = Builder.CreateIsNeg(Res); 529 Res = Builder.CreateBitCast(Res, IntegerTy); 530 Res = Builder.CreateZExtOrTrunc(Res, ResTy); 531 return Res; 532 } 533 534 static Value *simplifyX86addcarry(const IntrinsicInst &II, 535 InstCombiner::BuilderTy &Builder) { 536 Value *CarryIn = II.getArgOperand(0); 537 Value *Op1 = II.getArgOperand(1); 538 Value *Op2 = II.getArgOperand(2); 539 Type *RetTy = II.getType(); 540 Type *OpTy = Op1->getType(); 541 assert(RetTy->getStructElementType(0)->isIntegerTy(8) && 542 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && 543 "Unexpected types for x86 addcarry"); 544 545 // If carry-in is zero, this is just an unsigned add with overflow. 546 if (match(CarryIn, PatternMatch::m_ZeroInt())) { 547 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, 548 {Op1, Op2}); 549 // The types have to be adjusted to match the x86 call types. 550 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); 551 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), 552 Builder.getInt8Ty()); 553 Value *Res = PoisonValue::get(RetTy); 554 Res = Builder.CreateInsertValue(Res, UAddOV, 0); 555 return Builder.CreateInsertValue(Res, UAddResult, 1); 556 } 557 558 return nullptr; 559 } 560 561 static Value *simplifyTernarylogic(const IntrinsicInst &II, 562 InstCombiner::BuilderTy &Builder) { 563 564 auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3)); 565 if (!ArgImm || ArgImm->getValue().uge(256)) 566 return nullptr; 567 568 Value *ArgA = II.getArgOperand(0); 569 Value *ArgB = II.getArgOperand(1); 570 Value *ArgC = II.getArgOperand(2); 571 572 Type *Ty = II.getType(); 573 574 auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> { 575 return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second}; 576 }; 577 auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> { 578 return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second}; 579 }; 580 auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> { 581 return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second}; 582 }; 583 auto Not = [&](auto V) -> std::pair<Value *, uint8_t> { 584 return {Builder.CreateNot(V.first), ~V.second}; 585 }; 586 auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); }; 587 auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); }; 588 auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); }; 589 590 bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant()); 591 bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant()); 592 bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant()); 593 594 bool ABIsConst = AIsConst && BIsConst; 595 bool ACIsConst = AIsConst && CIsConst; 596 bool BCIsConst = BIsConst && CIsConst; 597 bool ABCIsConst = AIsConst && BIsConst && CIsConst; 598 599 // Use for verification. Its a big table. Its difficult to go from Imm -> 600 // logic ops, but easy to verify that a set of logic ops is correct. We track 601 // the logic ops through the second value in the pair. At the end it should 602 // equal Imm. 
603 std::pair<Value *, uint8_t> A = {ArgA, 0xf0}; 604 std::pair<Value *, uint8_t> B = {ArgB, 0xcc}; 605 std::pair<Value *, uint8_t> C = {ArgC, 0xaa}; 606 std::pair<Value *, uint8_t> Res = {nullptr, 0}; 607 608 // Currently we only handle cases that convert directly to another instruction 609 // or cases where all the ops are constant. This is because we don't properly 610 // handle creating ternary ops in the backend, so splitting them here may 611 // cause regressions. As the backend improves, uncomment more cases. 612 613 uint8_t Imm = ArgImm->getValue().getZExtValue(); 614 switch (Imm) { 615 case 0x0: 616 Res = {Constant::getNullValue(Ty), 0}; 617 break; 618 case 0x1: 619 if (ABCIsConst) 620 Res = Nor(Or(A, B), C); 621 break; 622 case 0x2: 623 if (ABCIsConst) 624 Res = And(Nor(A, B), C); 625 break; 626 case 0x3: 627 if (ABIsConst) 628 Res = Nor(A, B); 629 break; 630 case 0x4: 631 if (ABCIsConst) 632 Res = And(Nor(A, C), B); 633 break; 634 case 0x5: 635 if (ACIsConst) 636 Res = Nor(A, C); 637 break; 638 case 0x6: 639 if (ABCIsConst) 640 Res = Nor(A, Xnor(B, C)); 641 break; 642 case 0x7: 643 if (ABCIsConst) 644 Res = Nor(A, And(B, C)); 645 break; 646 case 0x8: 647 if (ABCIsConst) 648 Res = Nor(A, Nand(B, C)); 649 break; 650 case 0x9: 651 if (ABCIsConst) 652 Res = Nor(A, Xor(B, C)); 653 break; 654 case 0xa: 655 if (ACIsConst) 656 Res = Nor(A, Not(C)); 657 break; 658 case 0xb: 659 if (ABCIsConst) 660 Res = Nor(A, Nor(C, Not(B))); 661 break; 662 case 0xc: 663 if (ABIsConst) 664 Res = Nor(A, Not(B)); 665 break; 666 case 0xd: 667 if (ABCIsConst) 668 Res = Nor(A, Nor(B, Not(C))); 669 break; 670 case 0xe: 671 if (ABCIsConst) 672 Res = Nor(A, Nor(B, C)); 673 break; 674 case 0xf: 675 Res = Not(A); 676 break; 677 case 0x10: 678 if (ABCIsConst) 679 Res = And(A, Nor(B, C)); 680 break; 681 case 0x11: 682 if (BCIsConst) 683 Res = Nor(B, C); 684 break; 685 case 0x12: 686 if (ABCIsConst) 687 Res = Nor(Xnor(A, C), B); 688 break; 689 case 0x13: 690 if (ABCIsConst) 691 Res = Nor(And(A, C), B); 692 break; 693 case 0x14: 694 if (ABCIsConst) 695 Res = Nor(Xnor(A, B), C); 696 break; 697 case 0x15: 698 if (ABCIsConst) 699 Res = Nor(And(A, B), C); 700 break; 701 case 0x16: 702 if (ABCIsConst) 703 Res = Xor(Xor(A, B), And(Nand(A, B), C)); 704 break; 705 case 0x17: 706 if (ABCIsConst) 707 Res = Xor(Or(A, B), Or(Xnor(A, B), C)); 708 break; 709 case 0x18: 710 if (ABCIsConst) 711 Res = Nor(Xnor(A, B), Xnor(A, C)); 712 break; 713 case 0x19: 714 if (ABCIsConst) 715 Res = And(Nand(A, B), Xnor(B, C)); 716 break; 717 case 0x1a: 718 if (ABCIsConst) 719 Res = Xor(A, Or(And(A, B), C)); 720 break; 721 case 0x1b: 722 if (ABCIsConst) 723 Res = Xor(A, Or(Xnor(A, B), C)); 724 break; 725 case 0x1c: 726 if (ABCIsConst) 727 Res = Xor(A, Or(And(A, C), B)); 728 break; 729 case 0x1d: 730 if (ABCIsConst) 731 Res = Xor(A, Or(Xnor(A, C), B)); 732 break; 733 case 0x1e: 734 if (ABCIsConst) 735 Res = Xor(A, Or(B, C)); 736 break; 737 case 0x1f: 738 if (ABCIsConst) 739 Res = Nand(A, Or(B, C)); 740 break; 741 case 0x20: 742 if (ABCIsConst) 743 Res = Nor(Nand(A, C), B); 744 break; 745 case 0x21: 746 if (ABCIsConst) 747 Res = Nor(Xor(A, C), B); 748 break; 749 case 0x22: 750 if (BCIsConst) 751 Res = Nor(B, Not(C)); 752 break; 753 case 0x23: 754 if (ABCIsConst) 755 Res = Nor(B, Nor(C, Not(A))); 756 break; 757 case 0x24: 758 if (ABCIsConst) 759 Res = Nor(Xnor(A, B), Xor(A, C)); 760 break; 761 case 0x25: 762 if (ABCIsConst) 763 Res = Xor(A, Nand(Nand(A, B), C)); 764 break; 765 case 0x26: 766 if (ABCIsConst) 767 Res = And(Nand(A, B), Xor(B, C)); 768 
break; 769 case 0x27: 770 if (ABCIsConst) 771 Res = Xor(Or(Xnor(A, B), C), B); 772 break; 773 case 0x28: 774 if (ABCIsConst) 775 Res = And(Xor(A, B), C); 776 break; 777 case 0x29: 778 if (ABCIsConst) 779 Res = Xor(Xor(A, B), Nor(And(A, B), C)); 780 break; 781 case 0x2a: 782 if (ABCIsConst) 783 Res = And(Nand(A, B), C); 784 break; 785 case 0x2b: 786 if (ABCIsConst) 787 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A); 788 break; 789 case 0x2c: 790 if (ABCIsConst) 791 Res = Nor(Xnor(A, B), Nor(B, C)); 792 break; 793 case 0x2d: 794 if (ABCIsConst) 795 Res = Xor(A, Or(B, Not(C))); 796 break; 797 case 0x2e: 798 if (ABCIsConst) 799 Res = Xor(A, Or(Xor(A, C), B)); 800 break; 801 case 0x2f: 802 if (ABCIsConst) 803 Res = Nand(A, Or(B, Not(C))); 804 break; 805 case 0x30: 806 if (ABIsConst) 807 Res = Nor(B, Not(A)); 808 break; 809 case 0x31: 810 if (ABCIsConst) 811 Res = Nor(Nor(A, Not(C)), B); 812 break; 813 case 0x32: 814 if (ABCIsConst) 815 Res = Nor(Nor(A, C), B); 816 break; 817 case 0x33: 818 Res = Not(B); 819 break; 820 case 0x34: 821 if (ABCIsConst) 822 Res = And(Xor(A, B), Nand(B, C)); 823 break; 824 case 0x35: 825 if (ABCIsConst) 826 Res = Xor(B, Or(A, Xnor(B, C))); 827 break; 828 case 0x36: 829 if (ABCIsConst) 830 Res = Xor(Or(A, C), B); 831 break; 832 case 0x37: 833 if (ABCIsConst) 834 Res = Nand(Or(A, C), B); 835 break; 836 case 0x38: 837 if (ABCIsConst) 838 Res = Nor(Xnor(A, B), Nor(A, C)); 839 break; 840 case 0x39: 841 if (ABCIsConst) 842 Res = Xor(Or(A, Not(C)), B); 843 break; 844 case 0x3a: 845 if (ABCIsConst) 846 Res = Xor(B, Or(A, Xor(B, C))); 847 break; 848 case 0x3b: 849 if (ABCIsConst) 850 Res = Nand(Or(A, Not(C)), B); 851 break; 852 case 0x3c: 853 Res = Xor(A, B); 854 break; 855 case 0x3d: 856 if (ABCIsConst) 857 Res = Xor(A, Or(Nor(A, C), B)); 858 break; 859 case 0x3e: 860 if (ABCIsConst) 861 Res = Xor(A, Or(Nor(A, Not(C)), B)); 862 break; 863 case 0x3f: 864 if (ABIsConst) 865 Res = Nand(A, B); 866 break; 867 case 0x40: 868 if (ABCIsConst) 869 Res = Nor(Nand(A, B), C); 870 break; 871 case 0x41: 872 if (ABCIsConst) 873 Res = Nor(Xor(A, B), C); 874 break; 875 case 0x42: 876 if (ABCIsConst) 877 Res = Nor(Xor(A, B), Xnor(A, C)); 878 break; 879 case 0x43: 880 if (ABCIsConst) 881 Res = Xor(A, Nand(Nand(A, C), B)); 882 break; 883 case 0x44: 884 if (BCIsConst) 885 Res = Nor(C, Not(B)); 886 break; 887 case 0x45: 888 if (ABCIsConst) 889 Res = Nor(Nor(B, Not(A)), C); 890 break; 891 case 0x46: 892 if (ABCIsConst) 893 Res = Xor(Or(And(A, C), B), C); 894 break; 895 case 0x47: 896 if (ABCIsConst) 897 Res = Xor(Or(Xnor(A, C), B), C); 898 break; 899 case 0x48: 900 if (ABCIsConst) 901 Res = And(Xor(A, C), B); 902 break; 903 case 0x49: 904 if (ABCIsConst) 905 Res = Xor(Or(Xnor(A, B), And(A, C)), C); 906 break; 907 case 0x4a: 908 if (ABCIsConst) 909 Res = Nor(Xnor(A, C), Nor(B, C)); 910 break; 911 case 0x4b: 912 if (ABCIsConst) 913 Res = Xor(A, Or(C, Not(B))); 914 break; 915 case 0x4c: 916 if (ABCIsConst) 917 Res = And(Nand(A, C), B); 918 break; 919 case 0x4d: 920 if (ABCIsConst) 921 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A); 922 break; 923 case 0x4e: 924 if (ABCIsConst) 925 Res = Xor(A, Or(Xor(A, B), C)); 926 break; 927 case 0x4f: 928 if (ABCIsConst) 929 Res = Nand(A, Nand(B, Not(C))); 930 break; 931 case 0x50: 932 if (ACIsConst) 933 Res = Nor(C, Not(A)); 934 break; 935 case 0x51: 936 if (ABCIsConst) 937 Res = Nor(Nor(A, Not(B)), C); 938 break; 939 case 0x52: 940 if (ABCIsConst) 941 Res = And(Xor(A, C), Nand(B, C)); 942 break; 943 case 0x53: 944 if (ABCIsConst) 945 Res = Xor(Or(Xnor(B, C), A), C); 946 
break; 947 case 0x54: 948 if (ABCIsConst) 949 Res = Nor(Nor(A, B), C); 950 break; 951 case 0x55: 952 Res = Not(C); 953 break; 954 case 0x56: 955 if (ABCIsConst) 956 Res = Xor(Or(A, B), C); 957 break; 958 case 0x57: 959 if (ABCIsConst) 960 Res = Nand(Or(A, B), C); 961 break; 962 case 0x58: 963 if (ABCIsConst) 964 Res = Nor(Nor(A, B), Xnor(A, C)); 965 break; 966 case 0x59: 967 if (ABCIsConst) 968 Res = Xor(Or(A, Not(B)), C); 969 break; 970 case 0x5a: 971 Res = Xor(A, C); 972 break; 973 case 0x5b: 974 if (ABCIsConst) 975 Res = Xor(A, Or(Nor(A, B), C)); 976 break; 977 case 0x5c: 978 if (ABCIsConst) 979 Res = Xor(Or(Xor(B, C), A), C); 980 break; 981 case 0x5d: 982 if (ABCIsConst) 983 Res = Nand(Or(A, Not(B)), C); 984 break; 985 case 0x5e: 986 if (ABCIsConst) 987 Res = Xor(A, Or(Nor(A, Not(B)), C)); 988 break; 989 case 0x5f: 990 if (ACIsConst) 991 Res = Nand(A, C); 992 break; 993 case 0x60: 994 if (ABCIsConst) 995 Res = And(A, Xor(B, C)); 996 break; 997 case 0x61: 998 if (ABCIsConst) 999 Res = Xor(Or(Xnor(A, B), And(B, C)), C); 1000 break; 1001 case 0x62: 1002 if (ABCIsConst) 1003 Res = Nor(Nor(A, C), Xnor(B, C)); 1004 break; 1005 case 0x63: 1006 if (ABCIsConst) 1007 Res = Xor(B, Or(C, Not(A))); 1008 break; 1009 case 0x64: 1010 if (ABCIsConst) 1011 Res = Nor(Nor(A, B), Xnor(B, C)); 1012 break; 1013 case 0x65: 1014 if (ABCIsConst) 1015 Res = Xor(Or(B, Not(A)), C); 1016 break; 1017 case 0x66: 1018 Res = Xor(B, C); 1019 break; 1020 case 0x67: 1021 if (ABCIsConst) 1022 Res = Or(Nor(A, B), Xor(B, C)); 1023 break; 1024 case 0x68: 1025 if (ABCIsConst) 1026 Res = Xor(Xor(A, B), Nor(Nor(A, B), C)); 1027 break; 1028 case 0x69: 1029 if (ABCIsConst) 1030 Res = Xor(Xnor(A, B), C); 1031 break; 1032 case 0x6a: 1033 if (ABCIsConst) 1034 Res = Xor(And(A, B), C); 1035 break; 1036 case 0x6b: 1037 if (ABCIsConst) 1038 Res = Or(Nor(A, B), Xor(Xnor(A, B), C)); 1039 break; 1040 case 0x6c: 1041 if (ABCIsConst) 1042 Res = Xor(And(A, C), B); 1043 break; 1044 case 0x6d: 1045 if (ABCIsConst) 1046 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C); 1047 break; 1048 case 0x6e: 1049 if (ABCIsConst) 1050 Res = Or(Nor(A, Not(B)), Xor(B, C)); 1051 break; 1052 case 0x6f: 1053 if (ABCIsConst) 1054 Res = Nand(A, Xnor(B, C)); 1055 break; 1056 case 0x70: 1057 if (ABCIsConst) 1058 Res = And(A, Nand(B, C)); 1059 break; 1060 case 0x71: 1061 if (ABCIsConst) 1062 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A); 1063 break; 1064 case 0x72: 1065 if (ABCIsConst) 1066 Res = Xor(Or(Xor(A, B), C), B); 1067 break; 1068 case 0x73: 1069 if (ABCIsConst) 1070 Res = Nand(Nand(A, Not(C)), B); 1071 break; 1072 case 0x74: 1073 if (ABCIsConst) 1074 Res = Xor(Or(Xor(A, C), B), C); 1075 break; 1076 case 0x75: 1077 if (ABCIsConst) 1078 Res = Nand(Nand(A, Not(B)), C); 1079 break; 1080 case 0x76: 1081 if (ABCIsConst) 1082 Res = Xor(B, Or(Nor(B, Not(A)), C)); 1083 break; 1084 case 0x77: 1085 if (BCIsConst) 1086 Res = Nand(B, C); 1087 break; 1088 case 0x78: 1089 if (ABCIsConst) 1090 Res = Xor(A, And(B, C)); 1091 break; 1092 case 0x79: 1093 if (ABCIsConst) 1094 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C); 1095 break; 1096 case 0x7a: 1097 if (ABCIsConst) 1098 Res = Or(Xor(A, C), Nor(B, Not(A))); 1099 break; 1100 case 0x7b: 1101 if (ABCIsConst) 1102 Res = Nand(Xnor(A, C), B); 1103 break; 1104 case 0x7c: 1105 if (ABCIsConst) 1106 Res = Or(Xor(A, B), Nor(C, Not(A))); 1107 break; 1108 case 0x7d: 1109 if (ABCIsConst) 1110 Res = Nand(Xnor(A, B), C); 1111 break; 1112 case 0x7e: 1113 if (ABCIsConst) 1114 Res = Or(Xor(A, B), Xor(A, C)); 1115 break; 1116 case 0x7f: 1117 if (ABCIsConst) 1118 Res 
= Nand(And(A, B), C); 1119 break; 1120 case 0x80: 1121 if (ABCIsConst) 1122 Res = And(And(A, B), C); 1123 break; 1124 case 0x81: 1125 if (ABCIsConst) 1126 Res = Nor(Xor(A, B), Xor(A, C)); 1127 break; 1128 case 0x82: 1129 if (ABCIsConst) 1130 Res = And(Xnor(A, B), C); 1131 break; 1132 case 0x83: 1133 if (ABCIsConst) 1134 Res = Nor(Xor(A, B), Nor(C, Not(A))); 1135 break; 1136 case 0x84: 1137 if (ABCIsConst) 1138 Res = And(Xnor(A, C), B); 1139 break; 1140 case 0x85: 1141 if (ABCIsConst) 1142 Res = Nor(Xor(A, C), Nor(B, Not(A))); 1143 break; 1144 case 0x86: 1145 if (ABCIsConst) 1146 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C); 1147 break; 1148 case 0x87: 1149 if (ABCIsConst) 1150 Res = Xor(A, Nand(B, C)); 1151 break; 1152 case 0x88: 1153 Res = And(B, C); 1154 break; 1155 case 0x89: 1156 if (ABCIsConst) 1157 Res = Xor(B, Nor(Nor(B, Not(A)), C)); 1158 break; 1159 case 0x8a: 1160 if (ABCIsConst) 1161 Res = And(Nand(A, Not(B)), C); 1162 break; 1163 case 0x8b: 1164 if (ABCIsConst) 1165 Res = Xor(Nor(Xor(A, C), B), C); 1166 break; 1167 case 0x8c: 1168 if (ABCIsConst) 1169 Res = And(Nand(A, Not(C)), B); 1170 break; 1171 case 0x8d: 1172 if (ABCIsConst) 1173 Res = Xor(Nor(Xor(A, B), C), B); 1174 break; 1175 case 0x8e: 1176 if (ABCIsConst) 1177 Res = Xor(Or(Xor(A, B), Xor(A, C)), A); 1178 break; 1179 case 0x8f: 1180 if (ABCIsConst) 1181 Res = Nand(A, Nand(B, C)); 1182 break; 1183 case 0x90: 1184 if (ABCIsConst) 1185 Res = And(A, Xnor(B, C)); 1186 break; 1187 case 0x91: 1188 if (ABCIsConst) 1189 Res = Nor(Nor(A, Not(B)), Xor(B, C)); 1190 break; 1191 case 0x92: 1192 if (ABCIsConst) 1193 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C); 1194 break; 1195 case 0x93: 1196 if (ABCIsConst) 1197 Res = Xor(Nand(A, C), B); 1198 break; 1199 case 0x94: 1200 if (ABCIsConst) 1201 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C)); 1202 break; 1203 case 0x95: 1204 if (ABCIsConst) 1205 Res = Xor(Nand(A, B), C); 1206 break; 1207 case 0x96: 1208 if (ABCIsConst) 1209 Res = Xor(Xor(A, B), C); 1210 break; 1211 case 0x97: 1212 if (ABCIsConst) 1213 Res = Xor(Xor(A, B), Or(Nor(A, B), C)); 1214 break; 1215 case 0x98: 1216 if (ABCIsConst) 1217 Res = Nor(Nor(A, B), Xor(B, C)); 1218 break; 1219 case 0x99: 1220 if (BCIsConst) 1221 Res = Xnor(B, C); 1222 break; 1223 case 0x9a: 1224 if (ABCIsConst) 1225 Res = Xor(Nor(B, Not(A)), C); 1226 break; 1227 case 0x9b: 1228 if (ABCIsConst) 1229 Res = Or(Nor(A, B), Xnor(B, C)); 1230 break; 1231 case 0x9c: 1232 if (ABCIsConst) 1233 Res = Xor(B, Nor(C, Not(A))); 1234 break; 1235 case 0x9d: 1236 if (ABCIsConst) 1237 Res = Or(Nor(A, C), Xnor(B, C)); 1238 break; 1239 case 0x9e: 1240 if (ABCIsConst) 1241 Res = Xor(And(Xor(A, B), Nand(B, C)), C); 1242 break; 1243 case 0x9f: 1244 if (ABCIsConst) 1245 Res = Nand(A, Xor(B, C)); 1246 break; 1247 case 0xa0: 1248 Res = And(A, C); 1249 break; 1250 case 0xa1: 1251 if (ABCIsConst) 1252 Res = Xor(A, Nor(Nor(A, Not(B)), C)); 1253 break; 1254 case 0xa2: 1255 if (ABCIsConst) 1256 Res = And(Or(A, Not(B)), C); 1257 break; 1258 case 0xa3: 1259 if (ABCIsConst) 1260 Res = Xor(Nor(Xor(B, C), A), C); 1261 break; 1262 case 0xa4: 1263 if (ABCIsConst) 1264 Res = Xor(A, Nor(Nor(A, B), C)); 1265 break; 1266 case 0xa5: 1267 if (ACIsConst) 1268 Res = Xnor(A, C); 1269 break; 1270 case 0xa6: 1271 if (ABCIsConst) 1272 Res = Xor(Nor(A, Not(B)), C); 1273 break; 1274 case 0xa7: 1275 if (ABCIsConst) 1276 Res = Or(Nor(A, B), Xnor(A, C)); 1277 break; 1278 case 0xa8: 1279 if (ABCIsConst) 1280 Res = And(Or(A, B), C); 1281 break; 1282 case 0xa9: 1283 if (ABCIsConst) 1284 Res = Xor(Nor(A, B), C); 1285 break; 
1286 case 0xaa: 1287 Res = C; 1288 break; 1289 case 0xab: 1290 if (ABCIsConst) 1291 Res = Or(Nor(A, B), C); 1292 break; 1293 case 0xac: 1294 if (ABCIsConst) 1295 Res = Xor(Nor(Xnor(B, C), A), C); 1296 break; 1297 case 0xad: 1298 if (ABCIsConst) 1299 Res = Or(Xnor(A, C), And(B, C)); 1300 break; 1301 case 0xae: 1302 if (ABCIsConst) 1303 Res = Or(Nor(A, Not(B)), C); 1304 break; 1305 case 0xaf: 1306 if (ACIsConst) 1307 Res = Or(C, Not(A)); 1308 break; 1309 case 0xb0: 1310 if (ABCIsConst) 1311 Res = And(A, Nand(B, Not(C))); 1312 break; 1313 case 0xb1: 1314 if (ABCIsConst) 1315 Res = Xor(A, Nor(Xor(A, B), C)); 1316 break; 1317 case 0xb2: 1318 if (ABCIsConst) 1319 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A); 1320 break; 1321 case 0xb3: 1322 if (ABCIsConst) 1323 Res = Nand(Nand(A, C), B); 1324 break; 1325 case 0xb4: 1326 if (ABCIsConst) 1327 Res = Xor(A, Nor(C, Not(B))); 1328 break; 1329 case 0xb5: 1330 if (ABCIsConst) 1331 Res = Or(Xnor(A, C), Nor(B, C)); 1332 break; 1333 case 0xb6: 1334 if (ABCIsConst) 1335 Res = Xor(And(Xor(A, B), Nand(A, C)), C); 1336 break; 1337 case 0xb7: 1338 if (ABCIsConst) 1339 Res = Nand(Xor(A, C), B); 1340 break; 1341 case 0xb8: 1342 if (ABCIsConst) 1343 Res = Xor(Nor(Xnor(A, C), B), C); 1344 break; 1345 case 0xb9: 1346 if (ABCIsConst) 1347 Res = Xor(Nor(And(A, C), B), C); 1348 break; 1349 case 0xba: 1350 if (ABCIsConst) 1351 Res = Or(Nor(B, Not(A)), C); 1352 break; 1353 case 0xbb: 1354 if (BCIsConst) 1355 Res = Or(C, Not(B)); 1356 break; 1357 case 0xbc: 1358 if (ABCIsConst) 1359 Res = Xor(A, And(Nand(A, C), B)); 1360 break; 1361 case 0xbd: 1362 if (ABCIsConst) 1363 Res = Or(Xor(A, B), Xnor(A, C)); 1364 break; 1365 case 0xbe: 1366 if (ABCIsConst) 1367 Res = Or(Xor(A, B), C); 1368 break; 1369 case 0xbf: 1370 if (ABCIsConst) 1371 Res = Or(Nand(A, B), C); 1372 break; 1373 case 0xc0: 1374 Res = And(A, B); 1375 break; 1376 case 0xc1: 1377 if (ABCIsConst) 1378 Res = Xor(A, Nor(Nor(A, Not(C)), B)); 1379 break; 1380 case 0xc2: 1381 if (ABCIsConst) 1382 Res = Xor(A, Nor(Nor(A, C), B)); 1383 break; 1384 case 0xc3: 1385 if (ABIsConst) 1386 Res = Xnor(A, B); 1387 break; 1388 case 0xc4: 1389 if (ABCIsConst) 1390 Res = And(Or(A, Not(C)), B); 1391 break; 1392 case 0xc5: 1393 if (ABCIsConst) 1394 Res = Xor(B, Nor(A, Xor(B, C))); 1395 break; 1396 case 0xc6: 1397 if (ABCIsConst) 1398 Res = Xor(Nor(A, Not(C)), B); 1399 break; 1400 case 0xc7: 1401 if (ABCIsConst) 1402 Res = Or(Xnor(A, B), Nor(A, C)); 1403 break; 1404 case 0xc8: 1405 if (ABCIsConst) 1406 Res = And(Or(A, C), B); 1407 break; 1408 case 0xc9: 1409 if (ABCIsConst) 1410 Res = Xor(Nor(A, C), B); 1411 break; 1412 case 0xca: 1413 if (ABCIsConst) 1414 Res = Xor(B, Nor(A, Xnor(B, C))); 1415 break; 1416 case 0xcb: 1417 if (ABCIsConst) 1418 Res = Or(Xnor(A, B), And(B, C)); 1419 break; 1420 case 0xcc: 1421 Res = B; 1422 break; 1423 case 0xcd: 1424 if (ABCIsConst) 1425 Res = Or(Nor(A, C), B); 1426 break; 1427 case 0xce: 1428 if (ABCIsConst) 1429 Res = Or(Nor(A, Not(C)), B); 1430 break; 1431 case 0xcf: 1432 if (ABIsConst) 1433 Res = Or(B, Not(A)); 1434 break; 1435 case 0xd0: 1436 if (ABCIsConst) 1437 Res = And(A, Or(B, Not(C))); 1438 break; 1439 case 0xd1: 1440 if (ABCIsConst) 1441 Res = Xor(A, Nor(Xor(A, C), B)); 1442 break; 1443 case 0xd2: 1444 if (ABCIsConst) 1445 Res = Xor(A, Nor(B, Not(C))); 1446 break; 1447 case 0xd3: 1448 if (ABCIsConst) 1449 Res = Or(Xnor(A, B), Nor(B, C)); 1450 break; 1451 case 0xd4: 1452 if (ABCIsConst) 1453 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A); 1454 break; 1455 case 0xd5: 1456 if (ABCIsConst) 1457 Res = 
Nand(Nand(A, B), C); 1458 break; 1459 case 0xd6: 1460 if (ABCIsConst) 1461 Res = Xor(Xor(A, B), Or(And(A, B), C)); 1462 break; 1463 case 0xd7: 1464 if (ABCIsConst) 1465 Res = Nand(Xor(A, B), C); 1466 break; 1467 case 0xd8: 1468 if (ABCIsConst) 1469 Res = Xor(Nor(Xnor(A, B), C), B); 1470 break; 1471 case 0xd9: 1472 if (ABCIsConst) 1473 Res = Or(And(A, B), Xnor(B, C)); 1474 break; 1475 case 0xda: 1476 if (ABCIsConst) 1477 Res = Xor(A, And(Nand(A, B), C)); 1478 break; 1479 case 0xdb: 1480 if (ABCIsConst) 1481 Res = Or(Xnor(A, B), Xor(A, C)); 1482 break; 1483 case 0xdc: 1484 if (ABCIsConst) 1485 Res = Or(B, Nor(C, Not(A))); 1486 break; 1487 case 0xdd: 1488 if (BCIsConst) 1489 Res = Or(B, Not(C)); 1490 break; 1491 case 0xde: 1492 if (ABCIsConst) 1493 Res = Or(Xor(A, C), B); 1494 break; 1495 case 0xdf: 1496 if (ABCIsConst) 1497 Res = Or(Nand(A, C), B); 1498 break; 1499 case 0xe0: 1500 if (ABCIsConst) 1501 Res = And(A, Or(B, C)); 1502 break; 1503 case 0xe1: 1504 if (ABCIsConst) 1505 Res = Xor(A, Nor(B, C)); 1506 break; 1507 case 0xe2: 1508 if (ABCIsConst) 1509 Res = Xor(A, Nor(Xnor(A, C), B)); 1510 break; 1511 case 0xe3: 1512 if (ABCIsConst) 1513 Res = Xor(A, Nor(And(A, C), B)); 1514 break; 1515 case 0xe4: 1516 if (ABCIsConst) 1517 Res = Xor(A, Nor(Xnor(A, B), C)); 1518 break; 1519 case 0xe5: 1520 if (ABCIsConst) 1521 Res = Xor(A, Nor(And(A, B), C)); 1522 break; 1523 case 0xe6: 1524 if (ABCIsConst) 1525 Res = Or(And(A, B), Xor(B, C)); 1526 break; 1527 case 0xe7: 1528 if (ABCIsConst) 1529 Res = Or(Xnor(A, B), Xnor(A, C)); 1530 break; 1531 case 0xe8: 1532 if (ABCIsConst) 1533 Res = Xor(Or(A, B), Nor(Xnor(A, B), C)); 1534 break; 1535 case 0xe9: 1536 if (ABCIsConst) 1537 Res = Xor(Xor(A, B), Nand(Nand(A, B), C)); 1538 break; 1539 case 0xea: 1540 if (ABCIsConst) 1541 Res = Or(And(A, B), C); 1542 break; 1543 case 0xeb: 1544 if (ABCIsConst) 1545 Res = Or(Xnor(A, B), C); 1546 break; 1547 case 0xec: 1548 if (ABCIsConst) 1549 Res = Or(And(A, C), B); 1550 break; 1551 case 0xed: 1552 if (ABCIsConst) 1553 Res = Or(Xnor(A, C), B); 1554 break; 1555 case 0xee: 1556 Res = Or(B, C); 1557 break; 1558 case 0xef: 1559 if (ABCIsConst) 1560 Res = Nand(A, Nor(B, C)); 1561 break; 1562 case 0xf0: 1563 Res = A; 1564 break; 1565 case 0xf1: 1566 if (ABCIsConst) 1567 Res = Or(A, Nor(B, C)); 1568 break; 1569 case 0xf2: 1570 if (ABCIsConst) 1571 Res = Or(A, Nor(B, Not(C))); 1572 break; 1573 case 0xf3: 1574 if (ABIsConst) 1575 Res = Or(A, Not(B)); 1576 break; 1577 case 0xf4: 1578 if (ABCIsConst) 1579 Res = Or(A, Nor(C, Not(B))); 1580 break; 1581 case 0xf5: 1582 if (ACIsConst) 1583 Res = Or(A, Not(C)); 1584 break; 1585 case 0xf6: 1586 if (ABCIsConst) 1587 Res = Or(A, Xor(B, C)); 1588 break; 1589 case 0xf7: 1590 if (ABCIsConst) 1591 Res = Or(A, Nand(B, C)); 1592 break; 1593 case 0xf8: 1594 if (ABCIsConst) 1595 Res = Or(A, And(B, C)); 1596 break; 1597 case 0xf9: 1598 if (ABCIsConst) 1599 Res = Or(A, Xnor(B, C)); 1600 break; 1601 case 0xfa: 1602 Res = Or(A, C); 1603 break; 1604 case 0xfb: 1605 if (ABCIsConst) 1606 Res = Nand(Nor(A, C), B); 1607 break; 1608 case 0xfc: 1609 Res = Or(A, B); 1610 break; 1611 case 0xfd: 1612 if (ABCIsConst) 1613 Res = Nand(Nor(A, B), C); 1614 break; 1615 case 0xfe: 1616 if (ABCIsConst) 1617 Res = Or(Or(A, B), C); 1618 break; 1619 case 0xff: 1620 Res = {Constant::getAllOnesValue(Ty), 0xff}; 1621 break; 1622 } 1623 1624 assert((Res.first == nullptr || Res.second == Imm) && 1625 "Simplification of ternary logic does not verify!"); 1626 return Res.first; 1627 } 1628 1629 static Value 
*simplifyX86insertps(const IntrinsicInst &II, 1630 InstCombiner::BuilderTy &Builder) { 1631 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1632 if (!CInt) 1633 return nullptr; 1634 1635 auto *VecTy = cast<FixedVectorType>(II.getType()); 1636 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); 1637 1638 // The immediate permute control byte looks like this: 1639 // [3:0] - zero mask for each 32-bit lane 1640 // [5:4] - select one 32-bit destination lane 1641 // [7:6] - select one 32-bit source lane 1642 1643 uint8_t Imm = CInt->getZExtValue(); 1644 uint8_t ZMask = Imm & 0xf; 1645 uint8_t DestLane = (Imm >> 4) & 0x3; 1646 uint8_t SourceLane = (Imm >> 6) & 0x3; 1647 1648 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); 1649 1650 // If all zero mask bits are set, this was just a weird way to 1651 // generate a zero vector. 1652 if (ZMask == 0xf) 1653 return ZeroVector; 1654 1655 // Initialize by passing all of the first source bits through. 1656 int ShuffleMask[4] = {0, 1, 2, 3}; 1657 1658 // We may replace the second operand with the zero vector. 1659 Value *V1 = II.getArgOperand(1); 1660 1661 if (ZMask) { 1662 // If the zero mask is being used with a single input or the zero mask 1663 // overrides the destination lane, this is a shuffle with the zero vector. 1664 if ((II.getArgOperand(0) == II.getArgOperand(1)) || 1665 (ZMask & (1 << DestLane))) { 1666 V1 = ZeroVector; 1667 // We may still move 32-bits of the first source vector from one lane 1668 // to another. 1669 ShuffleMask[DestLane] = SourceLane; 1670 // The zero mask may override the previous insert operation. 1671 for (unsigned i = 0; i < 4; ++i) 1672 if ((ZMask >> i) & 0x1) 1673 ShuffleMask[i] = i + 4; 1674 } else { 1675 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? 1676 return nullptr; 1677 } 1678 } else { 1679 // Replace the selected destination lane with the selected source lane. 1680 ShuffleMask[DestLane] = SourceLane + 4; 1681 } 1682 1683 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); 1684 } 1685 1686 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding 1687 /// or conversion to a shuffle vector. 1688 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, 1689 ConstantInt *CILength, ConstantInt *CIIndex, 1690 InstCombiner::BuilderTy &Builder) { 1691 auto LowConstantHighUndef = [&](uint64_t Val) { 1692 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 1693 Constant *Args[] = {ConstantInt::get(IntTy64, Val), 1694 UndefValue::get(IntTy64)}; 1695 return ConstantVector::get(Args); 1696 }; 1697 1698 // See if we're dealing with constant values. 1699 auto *C0 = dyn_cast<Constant>(Op0); 1700 auto *CI0 = 1701 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 1702 : nullptr; 1703 1704 // Attempt to constant fold. 1705 if (CILength && CIIndex) { 1706 // From AMD documentation: "The bit index and field length are each six 1707 // bits in length other bits of the field are ignored." 1708 APInt APIndex = CIIndex->getValue().zextOrTrunc(6); 1709 APInt APLength = CILength->getValue().zextOrTrunc(6); 1710 1711 unsigned Index = APIndex.getZExtValue(); 1712 1713 // From AMD documentation: "a value of zero in the field length is 1714 // defined as length of 64". 1715 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 1716 1717 // From AMD documentation: "If the sum of the bit index + length field 1718 // is greater than 64, the results are undefined". 
1719 unsigned End = Index + Length; 1720 1721 // Note that both field index and field length are 8-bit quantities. 1722 // Since variables 'Index' and 'Length' are unsigned values 1723 // obtained from zero-extending field index and field length 1724 // respectively, their sum should never wrap around. 1725 if (End > 64) 1726 return UndefValue::get(II.getType()); 1727 1728 // If we are inserting whole bytes, we can convert this to a shuffle. 1729 // Lowering can recognize EXTRQI shuffle masks. 1730 if ((Length % 8) == 0 && (Index % 8) == 0) { 1731 // Convert bit indices to byte indices. 1732 Length /= 8; 1733 Index /= 8; 1734 1735 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1736 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 1737 1738 SmallVector<int, 16> ShuffleMask; 1739 for (int i = 0; i != (int)Length; ++i) 1740 ShuffleMask.push_back(i + Index); 1741 for (int i = Length; i != 8; ++i) 1742 ShuffleMask.push_back(i + 16); 1743 for (int i = 8; i != 16; ++i) 1744 ShuffleMask.push_back(-1); 1745 1746 Value *SV = Builder.CreateShuffleVector( 1747 Builder.CreateBitCast(Op0, ShufTy), 1748 ConstantAggregateZero::get(ShufTy), ShuffleMask); 1749 return Builder.CreateBitCast(SV, II.getType()); 1750 } 1751 1752 // Constant Fold - shift Index'th bit to lowest position and mask off 1753 // Length bits. 1754 if (CI0) { 1755 APInt Elt = CI0->getValue(); 1756 Elt.lshrInPlace(Index); 1757 Elt = Elt.zextOrTrunc(Length); 1758 return LowConstantHighUndef(Elt.getZExtValue()); 1759 } 1760 1761 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. 1762 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { 1763 Value *Args[] = {Op0, CILength, CIIndex}; 1764 Module *M = II.getModule(); 1765 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); 1766 return Builder.CreateCall(F, Args); 1767 } 1768 } 1769 1770 // Constant Fold - extraction from zero is always {zero, undef}. 1771 if (CI0 && CI0->isZero()) 1772 return LowConstantHighUndef(0); 1773 1774 return nullptr; 1775 } 1776 1777 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant 1778 /// folding or conversion to a shuffle vector. 1779 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, 1780 APInt APLength, APInt APIndex, 1781 InstCombiner::BuilderTy &Builder) { 1782 // From AMD documentation: "The bit index and field length are each six bits 1783 // in length other bits of the field are ignored." 1784 APIndex = APIndex.zextOrTrunc(6); 1785 APLength = APLength.zextOrTrunc(6); 1786 1787 // Attempt to constant fold. 1788 unsigned Index = APIndex.getZExtValue(); 1789 1790 // From AMD documentation: "a value of zero in the field length is 1791 // defined as length of 64". 1792 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 1793 1794 // From AMD documentation: "If the sum of the bit index + length field 1795 // is greater than 64, the results are undefined". 1796 unsigned End = Index + Length; 1797 1798 // Note that both field index and field length are 8-bit quantities. 1799 // Since variables 'Index' and 'Length' are unsigned values 1800 // obtained from zero-extending field index and field length 1801 // respectively, their sum should never wrap around. 1802 if (End > 64) 1803 return UndefValue::get(II.getType()); 1804 1805 // If we are inserting whole bytes, we can convert this to a shuffle. 1806 // Lowering can recognize INSERTQI shuffle masks. 1807 if ((Length % 8) == 0 && (Index % 8) == 0) { 1808 // Convert bit indices to byte indices. 
1809 Length /= 8; 1810 Index /= 8; 1811 1812 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1813 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 1814 1815 SmallVector<int, 16> ShuffleMask; 1816 for (int i = 0; i != (int)Index; ++i) 1817 ShuffleMask.push_back(i); 1818 for (int i = 0; i != (int)Length; ++i) 1819 ShuffleMask.push_back(i + 16); 1820 for (int i = Index + Length; i != 8; ++i) 1821 ShuffleMask.push_back(i); 1822 for (int i = 8; i != 16; ++i) 1823 ShuffleMask.push_back(-1); 1824 1825 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), 1826 Builder.CreateBitCast(Op1, ShufTy), 1827 ShuffleMask); 1828 return Builder.CreateBitCast(SV, II.getType()); 1829 } 1830 1831 // See if we're dealing with constant values. 1832 auto *C0 = dyn_cast<Constant>(Op0); 1833 auto *C1 = dyn_cast<Constant>(Op1); 1834 auto *CI00 = 1835 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 1836 : nullptr; 1837 auto *CI10 = 1838 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 1839 : nullptr; 1840 1841 // Constant Fold - insert bottom Length bits starting at the Index'th bit. 1842 if (CI00 && CI10) { 1843 APInt V00 = CI00->getValue(); 1844 APInt V10 = CI10->getValue(); 1845 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); 1846 V00 = V00 & ~Mask; 1847 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); 1848 APInt Val = V00 | V10; 1849 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 1850 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), 1851 UndefValue::get(IntTy64)}; 1852 return ConstantVector::get(Args); 1853 } 1854 1855 // If we were an INSERTQ call, we'll save demanded elements if we convert to 1856 // INSERTQI. 1857 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { 1858 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1859 Constant *CILength = ConstantInt::get(IntTy8, Length, false); 1860 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); 1861 1862 Value *Args[] = {Op0, Op1, CILength, CIIndex}; 1863 Module *M = II.getModule(); 1864 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); 1865 return Builder.CreateCall(F, Args); 1866 } 1867 1868 return nullptr; 1869 } 1870 1871 /// Attempt to convert pshufb* to shufflevector if the mask is constant. 1872 static Value *simplifyX86pshufb(const IntrinsicInst &II, 1873 InstCombiner::BuilderTy &Builder) { 1874 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 1875 if (!V) 1876 return nullptr; 1877 1878 auto *VecTy = cast<FixedVectorType>(II.getType()); 1879 unsigned NumElts = VecTy->getNumElements(); 1880 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && 1881 "Unexpected number of elements in shuffle mask!"); 1882 1883 // Construct a shuffle mask from constant integers or UNDEFs. 1884 int Indexes[64]; 1885 1886 // Each byte in the shuffle control mask forms an index to permute the 1887 // corresponding byte in the destination operand. 1888 for (unsigned I = 0; I < NumElts; ++I) { 1889 Constant *COp = V->getAggregateElement(I); 1890 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1891 return nullptr; 1892 1893 if (isa<UndefValue>(COp)) { 1894 Indexes[I] = -1; 1895 continue; 1896 } 1897 1898 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); 1899 1900 // If the most significant bit (bit[7]) of each byte of the shuffle 1901 // control mask is set, then zero is written in the result byte. 1902 // The zero vector is in the right-hand side of the resulting 1903 // shufflevector. 
1904 1905 // The value of each index for the high 128-bit lane is the least 1906 // significant 4 bits of the respective shuffle control byte. 1907 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0); 1908 Indexes[I] = Index; 1909 } 1910 1911 auto V1 = II.getArgOperand(0); 1912 auto V2 = Constant::getNullValue(VecTy); 1913 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts)); 1914 } 1915 1916 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant. 1917 static Value *simplifyX86vpermilvar(const IntrinsicInst &II, 1918 InstCombiner::BuilderTy &Builder) { 1919 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 1920 if (!V) 1921 return nullptr; 1922 1923 auto *VecTy = cast<FixedVectorType>(II.getType()); 1924 unsigned NumElts = VecTy->getNumElements(); 1925 bool IsPD = VecTy->getScalarType()->isDoubleTy(); 1926 unsigned NumLaneElts = IsPD ? 2 : 4; 1927 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2); 1928 1929 // Construct a shuffle mask from constant integers or UNDEFs. 1930 int Indexes[16]; 1931 1932 // The intrinsics only read one or two bits, clear the rest. 1933 for (unsigned I = 0; I < NumElts; ++I) { 1934 Constant *COp = V->getAggregateElement(I); 1935 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1936 return nullptr; 1937 1938 if (isa<UndefValue>(COp)) { 1939 Indexes[I] = -1; 1940 continue; 1941 } 1942 1943 APInt Index = cast<ConstantInt>(COp)->getValue(); 1944 Index = Index.zextOrTrunc(32).getLoBits(2); 1945 1946 // The PD variants uses bit 1 to select per-lane element index, so 1947 // shift down to convert to generic shuffle mask index. 1948 if (IsPD) 1949 Index.lshrInPlace(1); 1950 1951 // The _256 variants are a bit trickier since the mask bits always index 1952 // into the corresponding 128 half. In order to convert to a generic 1953 // shuffle, we have to make that explicit. 1954 Index += APInt(32, (I / NumLaneElts) * NumLaneElts); 1955 1956 Indexes[I] = Index.getZExtValue(); 1957 } 1958 1959 auto V1 = II.getArgOperand(0); 1960 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts)); 1961 } 1962 1963 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant. 1964 static Value *simplifyX86vpermv(const IntrinsicInst &II, 1965 InstCombiner::BuilderTy &Builder) { 1966 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 1967 if (!V) 1968 return nullptr; 1969 1970 auto *VecTy = cast<FixedVectorType>(II.getType()); 1971 unsigned Size = VecTy->getNumElements(); 1972 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) && 1973 "Unexpected shuffle mask size"); 1974 1975 // Construct a shuffle mask from constant integers or UNDEFs. 
1976 int Indexes[64]; 1977 1978 for (unsigned I = 0; I < Size; ++I) { 1979 Constant *COp = V->getAggregateElement(I); 1980 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1981 return nullptr; 1982 1983 if (isa<UndefValue>(COp)) { 1984 Indexes[I] = -1; 1985 continue; 1986 } 1987 1988 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 1989 Index &= Size - 1; 1990 Indexes[I] = Index; 1991 } 1992 1993 auto V1 = II.getArgOperand(0); 1994 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size)); 1995 } 1996 1997 std::optional<Instruction *> 1998 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { 1999 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, 2000 unsigned DemandedWidth) { 2001 APInt UndefElts(Width, 0); 2002 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 2003 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 2004 }; 2005 2006 Intrinsic::ID IID = II.getIntrinsicID(); 2007 switch (IID) { 2008 case Intrinsic::x86_bmi_bextr_32: 2009 case Intrinsic::x86_bmi_bextr_64: 2010 case Intrinsic::x86_tbm_bextri_u32: 2011 case Intrinsic::x86_tbm_bextri_u64: 2012 // If the RHS is a constant we can try some simplifications. 2013 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2014 uint64_t Shift = C->getZExtValue(); 2015 uint64_t Length = (Shift >> 8) & 0xff; 2016 Shift &= 0xff; 2017 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 2018 // If the length is 0 or the shift is out of range, replace with zero. 2019 if (Length == 0 || Shift >= BitWidth) { 2020 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2021 } 2022 // If the LHS is also a constant, we can completely constant fold this. 2023 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 2024 uint64_t Result = InC->getZExtValue() >> Shift; 2025 if (Length > BitWidth) 2026 Length = BitWidth; 2027 Result &= maskTrailingOnes<uint64_t>(Length); 2028 return IC.replaceInstUsesWith(II, 2029 ConstantInt::get(II.getType(), Result)); 2030 } 2031 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 2032 // are only masking bits that a shift already cleared? 2033 } 2034 break; 2035 2036 case Intrinsic::x86_bmi_bzhi_32: 2037 case Intrinsic::x86_bmi_bzhi_64: 2038 // If the RHS is a constant we can try some simplifications. 2039 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2040 uint64_t Index = C->getZExtValue() & 0xff; 2041 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 2042 if (Index >= BitWidth) { 2043 return IC.replaceInstUsesWith(II, II.getArgOperand(0)); 2044 } 2045 if (Index == 0) { 2046 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2047 } 2048 // If the LHS is also a constant, we can completely constant fold this. 2049 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 2050 uint64_t Result = InC->getZExtValue(); 2051 Result &= maskTrailingOnes<uint64_t>(Index); 2052 return IC.replaceInstUsesWith(II, 2053 ConstantInt::get(II.getType(), Result)); 2054 } 2055 // TODO should we convert this to an AND if the RHS is constant? 
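// Worked examples for the folds above (illustrative):
//   bzhi.32(x, 0)           -> 0
//   bzhi.32(x, 37)          -> x     (index >= the 32-bit width, value passes through)
//   bzhi.32(0x00ff00ff, 12) -> 0xff  (keep only the low 12 bits)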
2056 }
2057 break;
2058 case Intrinsic::x86_bmi_pext_32:
2059 case Intrinsic::x86_bmi_pext_64:
2060 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2061 if (MaskC->isNullValue()) {
2062 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2063 }
2064 if (MaskC->isAllOnesValue()) {
2065 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2066 }
2067
2068 unsigned MaskIdx, MaskLen;
2069 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2070 // Any single contiguous sequence of 1s anywhere in the mask simply
2071 // describes a subset of the input bits shifted to the appropriate
2072 // position. Replace with the straightforward IR.
2073 Value *Input = II.getArgOperand(0);
2074 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2075 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2076 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2077 return IC.replaceInstUsesWith(II, Shifted);
2078 }
2079
2080 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2081 uint64_t Src = SrcC->getZExtValue();
2082 uint64_t Mask = MaskC->getZExtValue();
2083 uint64_t Result = 0;
2084 uint64_t BitToSet = 1;
2085
2086 while (Mask) {
2087 // Isolate lowest set bit.
2088 uint64_t BitToTest = Mask & -Mask;
2089 if (BitToTest & Src)
2090 Result |= BitToSet;
2091
2092 BitToSet <<= 1;
2093 // Clear lowest set bit.
2094 Mask &= Mask - 1;
2095 }
2096
2097 return IC.replaceInstUsesWith(II,
2098 ConstantInt::get(II.getType(), Result));
2099 }
2100 }
2101 break;
2102 case Intrinsic::x86_bmi_pdep_32:
2103 case Intrinsic::x86_bmi_pdep_64:
2104 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2105 if (MaskC->isNullValue()) {
2106 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2107 }
2108 if (MaskC->isAllOnesValue()) {
2109 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2110 }
2111
2112 unsigned MaskIdx, MaskLen;
2113 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2114 // Any single contiguous sequence of 1s anywhere in the mask simply
2115 // describes a subset of the input bits shifted to the appropriate
2116 // position. Replace with the straightforward IR.
2117 Value *Input = II.getArgOperand(0);
2118 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2119 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2120 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2121 return IC.replaceInstUsesWith(II, Masked);
2122 }
2123
2124 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2125 uint64_t Src = SrcC->getZExtValue();
2126 uint64_t Mask = MaskC->getZExtValue();
2127 uint64_t Result = 0;
2128 uint64_t BitToTest = 1;
2129
2130 while (Mask) {
2131 // Isolate lowest set bit.
2132 uint64_t BitToSet = Mask & -Mask; 2133 if (BitToTest & Src) 2134 Result |= BitToSet; 2135 2136 BitToTest <<= 1; 2137 // Clear lowest set bit; 2138 Mask &= Mask - 1; 2139 } 2140 2141 return IC.replaceInstUsesWith(II, 2142 ConstantInt::get(II.getType(), Result)); 2143 } 2144 } 2145 break; 2146 2147 case Intrinsic::x86_sse_cvtss2si: 2148 case Intrinsic::x86_sse_cvtss2si64: 2149 case Intrinsic::x86_sse_cvttss2si: 2150 case Intrinsic::x86_sse_cvttss2si64: 2151 case Intrinsic::x86_sse2_cvtsd2si: 2152 case Intrinsic::x86_sse2_cvtsd2si64: 2153 case Intrinsic::x86_sse2_cvttsd2si: 2154 case Intrinsic::x86_sse2_cvttsd2si64: 2155 case Intrinsic::x86_avx512_vcvtss2si32: 2156 case Intrinsic::x86_avx512_vcvtss2si64: 2157 case Intrinsic::x86_avx512_vcvtss2usi32: 2158 case Intrinsic::x86_avx512_vcvtss2usi64: 2159 case Intrinsic::x86_avx512_vcvtsd2si32: 2160 case Intrinsic::x86_avx512_vcvtsd2si64: 2161 case Intrinsic::x86_avx512_vcvtsd2usi32: 2162 case Intrinsic::x86_avx512_vcvtsd2usi64: 2163 case Intrinsic::x86_avx512_cvttss2si: 2164 case Intrinsic::x86_avx512_cvttss2si64: 2165 case Intrinsic::x86_avx512_cvttss2usi: 2166 case Intrinsic::x86_avx512_cvttss2usi64: 2167 case Intrinsic::x86_avx512_cvttsd2si: 2168 case Intrinsic::x86_avx512_cvttsd2si64: 2169 case Intrinsic::x86_avx512_cvttsd2usi: 2170 case Intrinsic::x86_avx512_cvttsd2usi64: { 2171 // These intrinsics only demand the 0th element of their input vectors. If 2172 // we can simplify the input based on that, do so now. 2173 Value *Arg = II.getArgOperand(0); 2174 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); 2175 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2176 return IC.replaceOperand(II, 0, V); 2177 } 2178 break; 2179 } 2180 2181 case Intrinsic::x86_mmx_pmovmskb: 2182 case Intrinsic::x86_sse_movmsk_ps: 2183 case Intrinsic::x86_sse2_movmsk_pd: 2184 case Intrinsic::x86_sse2_pmovmskb_128: 2185 case Intrinsic::x86_avx_movmsk_pd_256: 2186 case Intrinsic::x86_avx_movmsk_ps_256: 2187 case Intrinsic::x86_avx2_pmovmskb: 2188 if (Value *V = simplifyX86movmsk(II, IC.Builder)) { 2189 return IC.replaceInstUsesWith(II, V); 2190 } 2191 break; 2192 2193 case Intrinsic::x86_sse_comieq_ss: 2194 case Intrinsic::x86_sse_comige_ss: 2195 case Intrinsic::x86_sse_comigt_ss: 2196 case Intrinsic::x86_sse_comile_ss: 2197 case Intrinsic::x86_sse_comilt_ss: 2198 case Intrinsic::x86_sse_comineq_ss: 2199 case Intrinsic::x86_sse_ucomieq_ss: 2200 case Intrinsic::x86_sse_ucomige_ss: 2201 case Intrinsic::x86_sse_ucomigt_ss: 2202 case Intrinsic::x86_sse_ucomile_ss: 2203 case Intrinsic::x86_sse_ucomilt_ss: 2204 case Intrinsic::x86_sse_ucomineq_ss: 2205 case Intrinsic::x86_sse2_comieq_sd: 2206 case Intrinsic::x86_sse2_comige_sd: 2207 case Intrinsic::x86_sse2_comigt_sd: 2208 case Intrinsic::x86_sse2_comile_sd: 2209 case Intrinsic::x86_sse2_comilt_sd: 2210 case Intrinsic::x86_sse2_comineq_sd: 2211 case Intrinsic::x86_sse2_ucomieq_sd: 2212 case Intrinsic::x86_sse2_ucomige_sd: 2213 case Intrinsic::x86_sse2_ucomigt_sd: 2214 case Intrinsic::x86_sse2_ucomile_sd: 2215 case Intrinsic::x86_sse2_ucomilt_sd: 2216 case Intrinsic::x86_sse2_ucomineq_sd: 2217 case Intrinsic::x86_avx512_vcomi_ss: 2218 case Intrinsic::x86_avx512_vcomi_sd: 2219 case Intrinsic::x86_avx512_mask_cmp_ss: 2220 case Intrinsic::x86_avx512_mask_cmp_sd: { 2221 // These intrinsics only demand the 0th element of their input vectors. If 2222 // we can simplify the input based on that, do so now. 
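// For example (illustrative), if Arg0 or Arg1 was widened from a scalar with
// insertelement into lanes 1-3, those lanes are not demanded here and
// SimplifyDemandedVectorElts can strip the inserts.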
2223 bool MadeChange = false; 2224 Value *Arg0 = II.getArgOperand(0); 2225 Value *Arg1 = II.getArgOperand(1); 2226 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2227 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2228 IC.replaceOperand(II, 0, V); 2229 MadeChange = true; 2230 } 2231 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2232 IC.replaceOperand(II, 1, V); 2233 MadeChange = true; 2234 } 2235 if (MadeChange) { 2236 return &II; 2237 } 2238 break; 2239 } 2240 2241 case Intrinsic::x86_avx512_add_ps_512: 2242 case Intrinsic::x86_avx512_div_ps_512: 2243 case Intrinsic::x86_avx512_mul_ps_512: 2244 case Intrinsic::x86_avx512_sub_ps_512: 2245 case Intrinsic::x86_avx512_add_pd_512: 2246 case Intrinsic::x86_avx512_div_pd_512: 2247 case Intrinsic::x86_avx512_mul_pd_512: 2248 case Intrinsic::x86_avx512_sub_pd_512: 2249 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2250 // IR operations. 2251 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2252 if (R->getValue() == 4) { 2253 Value *Arg0 = II.getArgOperand(0); 2254 Value *Arg1 = II.getArgOperand(1); 2255 2256 Value *V; 2257 switch (IID) { 2258 default: 2259 llvm_unreachable("Case stmts out of sync!"); 2260 case Intrinsic::x86_avx512_add_ps_512: 2261 case Intrinsic::x86_avx512_add_pd_512: 2262 V = IC.Builder.CreateFAdd(Arg0, Arg1); 2263 break; 2264 case Intrinsic::x86_avx512_sub_ps_512: 2265 case Intrinsic::x86_avx512_sub_pd_512: 2266 V = IC.Builder.CreateFSub(Arg0, Arg1); 2267 break; 2268 case Intrinsic::x86_avx512_mul_ps_512: 2269 case Intrinsic::x86_avx512_mul_pd_512: 2270 V = IC.Builder.CreateFMul(Arg0, Arg1); 2271 break; 2272 case Intrinsic::x86_avx512_div_ps_512: 2273 case Intrinsic::x86_avx512_div_pd_512: 2274 V = IC.Builder.CreateFDiv(Arg0, Arg1); 2275 break; 2276 } 2277 2278 return IC.replaceInstUsesWith(II, V); 2279 } 2280 } 2281 break; 2282 2283 case Intrinsic::x86_avx512_mask_add_ss_round: 2284 case Intrinsic::x86_avx512_mask_div_ss_round: 2285 case Intrinsic::x86_avx512_mask_mul_ss_round: 2286 case Intrinsic::x86_avx512_mask_sub_ss_round: 2287 case Intrinsic::x86_avx512_mask_add_sd_round: 2288 case Intrinsic::x86_avx512_mask_div_sd_round: 2289 case Intrinsic::x86_avx512_mask_mul_sd_round: 2290 case Intrinsic::x86_avx512_mask_sub_sd_round: 2291 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2292 // IR operations. 2293 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { 2294 if (R->getValue() == 4) { 2295 // Extract the element as scalars. 
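// The overall shape for add_ss_round with a non-constant mask is
// (illustrative):
//   %a = extractelement <4 x float> %Arg0, i64 0
//   %b = extractelement <4 x float> %Arg1, i64 0
//   %r = fadd float %a, %b
//   %s = select i1 %maskbit0, float %r, float %passthru0
//   %v = insertelement <4 x float> %Arg0, float %s, i64 0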
2296 Value *Arg0 = II.getArgOperand(0); 2297 Value *Arg1 = II.getArgOperand(1); 2298 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); 2299 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); 2300 2301 Value *V; 2302 switch (IID) { 2303 default: 2304 llvm_unreachable("Case stmts out of sync!"); 2305 case Intrinsic::x86_avx512_mask_add_ss_round: 2306 case Intrinsic::x86_avx512_mask_add_sd_round: 2307 V = IC.Builder.CreateFAdd(LHS, RHS); 2308 break; 2309 case Intrinsic::x86_avx512_mask_sub_ss_round: 2310 case Intrinsic::x86_avx512_mask_sub_sd_round: 2311 V = IC.Builder.CreateFSub(LHS, RHS); 2312 break; 2313 case Intrinsic::x86_avx512_mask_mul_ss_round: 2314 case Intrinsic::x86_avx512_mask_mul_sd_round: 2315 V = IC.Builder.CreateFMul(LHS, RHS); 2316 break; 2317 case Intrinsic::x86_avx512_mask_div_ss_round: 2318 case Intrinsic::x86_avx512_mask_div_sd_round: 2319 V = IC.Builder.CreateFDiv(LHS, RHS); 2320 break; 2321 } 2322 2323 // Handle the masking aspect of the intrinsic. 2324 Value *Mask = II.getArgOperand(3); 2325 auto *C = dyn_cast<ConstantInt>(Mask); 2326 // We don't need a select if we know the mask bit is a 1. 2327 if (!C || !C->getValue()[0]) { 2328 // Cast the mask to an i1 vector and then extract the lowest element. 2329 auto *MaskTy = FixedVectorType::get( 2330 IC.Builder.getInt1Ty(), 2331 cast<IntegerType>(Mask->getType())->getBitWidth()); 2332 Mask = IC.Builder.CreateBitCast(Mask, MaskTy); 2333 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); 2334 // Extract the lowest element from the passthru operand. 2335 Value *Passthru = 2336 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); 2337 V = IC.Builder.CreateSelect(Mask, V, Passthru); 2338 } 2339 2340 // Insert the result back into the original argument 0. 2341 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2342 2343 return IC.replaceInstUsesWith(II, V); 2344 } 2345 } 2346 break; 2347 2348 // Constant fold ashr( <A x Bi>, Ci ). 2349 // Constant fold lshr( <A x Bi>, Ci ). 2350 // Constant fold shl( <A x Bi>, Ci ). 
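// For example (illustrative), psrli.d(<4 x i32> %x, 3) can become
// lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>. simplifyX86immShift also
// handles out-of-range counts: logical shifts by at least the element width
// fold to zero, while arithmetic shifts clamp the count to BitWidth - 1.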
2351 case Intrinsic::x86_sse2_psrai_d: 2352 case Intrinsic::x86_sse2_psrai_w: 2353 case Intrinsic::x86_avx2_psrai_d: 2354 case Intrinsic::x86_avx2_psrai_w: 2355 case Intrinsic::x86_avx512_psrai_q_128: 2356 case Intrinsic::x86_avx512_psrai_q_256: 2357 case Intrinsic::x86_avx512_psrai_d_512: 2358 case Intrinsic::x86_avx512_psrai_q_512: 2359 case Intrinsic::x86_avx512_psrai_w_512: 2360 case Intrinsic::x86_sse2_psrli_d: 2361 case Intrinsic::x86_sse2_psrli_q: 2362 case Intrinsic::x86_sse2_psrli_w: 2363 case Intrinsic::x86_avx2_psrli_d: 2364 case Intrinsic::x86_avx2_psrli_q: 2365 case Intrinsic::x86_avx2_psrli_w: 2366 case Intrinsic::x86_avx512_psrli_d_512: 2367 case Intrinsic::x86_avx512_psrli_q_512: 2368 case Intrinsic::x86_avx512_psrli_w_512: 2369 case Intrinsic::x86_sse2_pslli_d: 2370 case Intrinsic::x86_sse2_pslli_q: 2371 case Intrinsic::x86_sse2_pslli_w: 2372 case Intrinsic::x86_avx2_pslli_d: 2373 case Intrinsic::x86_avx2_pslli_q: 2374 case Intrinsic::x86_avx2_pslli_w: 2375 case Intrinsic::x86_avx512_pslli_d_512: 2376 case Intrinsic::x86_avx512_pslli_q_512: 2377 case Intrinsic::x86_avx512_pslli_w_512: 2378 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2379 return IC.replaceInstUsesWith(II, V); 2380 } 2381 break; 2382 2383 case Intrinsic::x86_sse2_psra_d: 2384 case Intrinsic::x86_sse2_psra_w: 2385 case Intrinsic::x86_avx2_psra_d: 2386 case Intrinsic::x86_avx2_psra_w: 2387 case Intrinsic::x86_avx512_psra_q_128: 2388 case Intrinsic::x86_avx512_psra_q_256: 2389 case Intrinsic::x86_avx512_psra_d_512: 2390 case Intrinsic::x86_avx512_psra_q_512: 2391 case Intrinsic::x86_avx512_psra_w_512: 2392 case Intrinsic::x86_sse2_psrl_d: 2393 case Intrinsic::x86_sse2_psrl_q: 2394 case Intrinsic::x86_sse2_psrl_w: 2395 case Intrinsic::x86_avx2_psrl_d: 2396 case Intrinsic::x86_avx2_psrl_q: 2397 case Intrinsic::x86_avx2_psrl_w: 2398 case Intrinsic::x86_avx512_psrl_d_512: 2399 case Intrinsic::x86_avx512_psrl_q_512: 2400 case Intrinsic::x86_avx512_psrl_w_512: 2401 case Intrinsic::x86_sse2_psll_d: 2402 case Intrinsic::x86_sse2_psll_q: 2403 case Intrinsic::x86_sse2_psll_w: 2404 case Intrinsic::x86_avx2_psll_d: 2405 case Intrinsic::x86_avx2_psll_q: 2406 case Intrinsic::x86_avx2_psll_w: 2407 case Intrinsic::x86_avx512_psll_d_512: 2408 case Intrinsic::x86_avx512_psll_q_512: 2409 case Intrinsic::x86_avx512_psll_w_512: { 2410 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2411 return IC.replaceInstUsesWith(II, V); 2412 } 2413 2414 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2415 // operand to compute the shift amount. 
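// For example (illustrative), for psrl.d the shift amount comes from the low
// 64 bits of the <4 x i32> operand, i.e. elements 0 and 1, so the upper two
// elements can be simplified away below (VWidth / 2 == 2).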
2416 Value *Arg1 = II.getArgOperand(1); 2417 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2418 "Unexpected packed shift size"); 2419 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); 2420 2421 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2422 return IC.replaceOperand(II, 1, V); 2423 } 2424 break; 2425 } 2426 2427 case Intrinsic::x86_avx2_psllv_d: 2428 case Intrinsic::x86_avx2_psllv_d_256: 2429 case Intrinsic::x86_avx2_psllv_q: 2430 case Intrinsic::x86_avx2_psllv_q_256: 2431 case Intrinsic::x86_avx512_psllv_d_512: 2432 case Intrinsic::x86_avx512_psllv_q_512: 2433 case Intrinsic::x86_avx512_psllv_w_128: 2434 case Intrinsic::x86_avx512_psllv_w_256: 2435 case Intrinsic::x86_avx512_psllv_w_512: 2436 case Intrinsic::x86_avx2_psrav_d: 2437 case Intrinsic::x86_avx2_psrav_d_256: 2438 case Intrinsic::x86_avx512_psrav_q_128: 2439 case Intrinsic::x86_avx512_psrav_q_256: 2440 case Intrinsic::x86_avx512_psrav_d_512: 2441 case Intrinsic::x86_avx512_psrav_q_512: 2442 case Intrinsic::x86_avx512_psrav_w_128: 2443 case Intrinsic::x86_avx512_psrav_w_256: 2444 case Intrinsic::x86_avx512_psrav_w_512: 2445 case Intrinsic::x86_avx2_psrlv_d: 2446 case Intrinsic::x86_avx2_psrlv_d_256: 2447 case Intrinsic::x86_avx2_psrlv_q: 2448 case Intrinsic::x86_avx2_psrlv_q_256: 2449 case Intrinsic::x86_avx512_psrlv_d_512: 2450 case Intrinsic::x86_avx512_psrlv_q_512: 2451 case Intrinsic::x86_avx512_psrlv_w_128: 2452 case Intrinsic::x86_avx512_psrlv_w_256: 2453 case Intrinsic::x86_avx512_psrlv_w_512: 2454 if (Value *V = simplifyX86varShift(II, IC.Builder)) { 2455 return IC.replaceInstUsesWith(II, V); 2456 } 2457 break; 2458 2459 case Intrinsic::x86_sse2_packssdw_128: 2460 case Intrinsic::x86_sse2_packsswb_128: 2461 case Intrinsic::x86_avx2_packssdw: 2462 case Intrinsic::x86_avx2_packsswb: 2463 case Intrinsic::x86_avx512_packssdw_512: 2464 case Intrinsic::x86_avx512_packsswb_512: 2465 if (Value *V = simplifyX86pack(II, IC.Builder, true)) { 2466 return IC.replaceInstUsesWith(II, V); 2467 } 2468 break; 2469 2470 case Intrinsic::x86_sse2_packuswb_128: 2471 case Intrinsic::x86_sse41_packusdw: 2472 case Intrinsic::x86_avx2_packusdw: 2473 case Intrinsic::x86_avx2_packuswb: 2474 case Intrinsic::x86_avx512_packusdw_512: 2475 case Intrinsic::x86_avx512_packuswb_512: 2476 if (Value *V = simplifyX86pack(II, IC.Builder, false)) { 2477 return IC.replaceInstUsesWith(II, V); 2478 } 2479 break; 2480 2481 case Intrinsic::x86_pclmulqdq: 2482 case Intrinsic::x86_pclmulqdq_256: 2483 case Intrinsic::x86_pclmulqdq_512: { 2484 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2485 unsigned Imm = C->getZExtValue(); 2486 2487 bool MadeChange = false; 2488 Value *Arg0 = II.getArgOperand(0); 2489 Value *Arg1 = II.getArgOperand(1); 2490 unsigned VWidth = 2491 cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2492 2493 APInt UndefElts1(VWidth, 0); 2494 APInt DemandedElts1 = 2495 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); 2496 if (Value *V = 2497 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { 2498 IC.replaceOperand(II, 0, V); 2499 MadeChange = true; 2500 } 2501 2502 APInt UndefElts2(VWidth, 0); 2503 APInt DemandedElts2 = 2504 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); 2505 if (Value *V = 2506 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { 2507 IC.replaceOperand(II, 1, V); 2508 MadeChange = true; 2509 } 2510 2511 // If either input elements are undef, the result is zero. 
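// For example (illustrative), with Imm = 0x11 the 128-bit pclmulqdq multiplies
// the upper i64 of each <2 x i64> operand; if that element of either operand
// is undef, the whole carry-less product folds to zero below.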
2512 if (DemandedElts1.isSubsetOf(UndefElts1) || 2513 DemandedElts2.isSubsetOf(UndefElts2)) { 2514 return IC.replaceInstUsesWith(II, 2515 ConstantAggregateZero::get(II.getType())); 2516 } 2517 2518 if (MadeChange) { 2519 return &II; 2520 } 2521 } 2522 break; 2523 } 2524 2525 case Intrinsic::x86_sse41_insertps: 2526 if (Value *V = simplifyX86insertps(II, IC.Builder)) { 2527 return IC.replaceInstUsesWith(II, V); 2528 } 2529 break; 2530 2531 case Intrinsic::x86_sse4a_extrq: { 2532 Value *Op0 = II.getArgOperand(0); 2533 Value *Op1 = II.getArgOperand(1); 2534 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2535 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2536 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2537 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2538 VWidth1 == 16 && "Unexpected operand sizes"); 2539 2540 // See if we're dealing with constant values. 2541 auto *C1 = dyn_cast<Constant>(Op1); 2542 auto *CILength = 2543 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 2544 : nullptr; 2545 auto *CIIndex = 2546 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2547 : nullptr; 2548 2549 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 2550 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2551 return IC.replaceInstUsesWith(II, V); 2552 } 2553 2554 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 2555 // operands and the lowest 16-bits of the second. 2556 bool MadeChange = false; 2557 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2558 IC.replaceOperand(II, 0, V); 2559 MadeChange = true; 2560 } 2561 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 2562 IC.replaceOperand(II, 1, V); 2563 MadeChange = true; 2564 } 2565 if (MadeChange) { 2566 return &II; 2567 } 2568 break; 2569 } 2570 2571 case Intrinsic::x86_sse4a_extrqi: { 2572 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 2573 // bits of the lower 64-bits. The upper 64-bits are undefined. 2574 Value *Op0 = II.getArgOperand(0); 2575 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2576 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2577 "Unexpected operand size"); 2578 2579 // See if we're dealing with constant values. 2580 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); 2581 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2582 2583 // Attempt to simplify to a constant or shuffle vector. 2584 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2585 return IC.replaceInstUsesWith(II, V); 2586 } 2587 2588 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 2589 // operand. 2590 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2591 return IC.replaceOperand(II, 0, V); 2592 } 2593 break; 2594 } 2595 2596 case Intrinsic::x86_sse4a_insertq: { 2597 Value *Op0 = II.getArgOperand(0); 2598 Value *Op1 = II.getArgOperand(1); 2599 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2600 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2601 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2602 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && 2603 "Unexpected operand size"); 2604 2605 // See if we're dealing with constant values. 2606 auto *C1 = dyn_cast<Constant>(Op1); 2607 auto *CI11 = 2608 C1 ? 
dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2609 : nullptr; 2610 2611 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 2612 if (CI11) { 2613 const APInt &V11 = CI11->getValue(); 2614 APInt Len = V11.zextOrTrunc(6); 2615 APInt Idx = V11.lshr(8).zextOrTrunc(6); 2616 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2617 return IC.replaceInstUsesWith(II, V); 2618 } 2619 } 2620 2621 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 2622 // operand. 2623 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2624 return IC.replaceOperand(II, 0, V); 2625 } 2626 break; 2627 } 2628 2629 case Intrinsic::x86_sse4a_insertqi: { 2630 // INSERTQI: Extract lowest Length bits from lower half of second source and 2631 // insert over first source starting at Index bit. The upper 64-bits are 2632 // undefined. 2633 Value *Op0 = II.getArgOperand(0); 2634 Value *Op1 = II.getArgOperand(1); 2635 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2636 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2637 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2638 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2639 VWidth1 == 2 && "Unexpected operand sizes"); 2640 2641 // See if we're dealing with constant values. 2642 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2643 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); 2644 2645 // Attempt to simplify to a constant or shuffle vector. 2646 if (CILength && CIIndex) { 2647 APInt Len = CILength->getValue().zextOrTrunc(6); 2648 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 2649 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2650 return IC.replaceInstUsesWith(II, V); 2651 } 2652 } 2653 2654 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 2655 // operands. 2656 bool MadeChange = false; 2657 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2658 IC.replaceOperand(II, 0, V); 2659 MadeChange = true; 2660 } 2661 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 2662 IC.replaceOperand(II, 1, V); 2663 MadeChange = true; 2664 } 2665 if (MadeChange) { 2666 return &II; 2667 } 2668 break; 2669 } 2670 2671 case Intrinsic::x86_sse41_pblendvb: 2672 case Intrinsic::x86_sse41_blendvps: 2673 case Intrinsic::x86_sse41_blendvpd: 2674 case Intrinsic::x86_avx_blendv_ps_256: 2675 case Intrinsic::x86_avx_blendv_pd_256: 2676 case Intrinsic::x86_avx2_pblendvb: { 2677 // fold (blend A, A, Mask) -> A 2678 Value *Op0 = II.getArgOperand(0); 2679 Value *Op1 = II.getArgOperand(1); 2680 Value *Mask = II.getArgOperand(2); 2681 if (Op0 == Op1) { 2682 return IC.replaceInstUsesWith(II, Op0); 2683 } 2684 2685 // Zero Mask - select 1st argument. 2686 if (isa<ConstantAggregateZero>(Mask)) { 2687 return IC.replaceInstUsesWith(II, Op0); 2688 } 2689 2690 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 2691 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 2692 Constant *NewSelector = 2693 getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout()); 2694 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 2695 } 2696 2697 // Convert to a vector select if we can bypass casts and find a boolean 2698 // vector condition value. 
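// For example (illustrative):
//   blendvps(%x, %y, bitcast (sext <4 x i1> %b to <4 x i32>))
// becomes
//   select <4 x i1> %b, <4 x float> %y, <4 x float> %x
// Note the operand order: a set mask bit selects the second source.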
2699 Value *BoolVec;
2700 Mask = InstCombiner::peekThroughBitcast(Mask);
2701 if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
2702 BoolVec->getType()->isVectorTy() &&
2703 BoolVec->getType()->getScalarSizeInBits() == 1) {
2704 auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2705 auto *OpTy = cast<FixedVectorType>(II.getType());
2706 assert(MaskTy->getPrimitiveSizeInBits() ==
2707 OpTy->getPrimitiveSizeInBits() &&
2708 "Not expecting mask and operands with different sizes");
2709 unsigned NumMaskElts = MaskTy->getNumElements();
2710 unsigned NumOperandElts = OpTy->getNumElements();
2711
2712 if (NumMaskElts == NumOperandElts) {
2713 return SelectInst::Create(BoolVec, Op1, Op0);
2714 }
2715
2716 // If the mask has fewer elements than the operands, each mask bit maps to
2717 // multiple elements of the operands. Bitcast back and forth.
2718 if (NumMaskElts < NumOperandElts) {
2719 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
2720 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
2721 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
2722 return new BitCastInst(Sel, II.getType());
2723 }
2724 }
2725
2726 break;
2727 }
2728
2729 case Intrinsic::x86_ssse3_pshuf_b_128:
2730 case Intrinsic::x86_avx2_pshuf_b:
2731 case Intrinsic::x86_avx512_pshuf_b_512:
2732 if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
2733 return IC.replaceInstUsesWith(II, V);
2734 }
2735 break;
2736
2737 case Intrinsic::x86_avx_vpermilvar_ps:
2738 case Intrinsic::x86_avx_vpermilvar_ps_256:
2739 case Intrinsic::x86_avx512_vpermilvar_ps_512:
2740 case Intrinsic::x86_avx_vpermilvar_pd:
2741 case Intrinsic::x86_avx_vpermilvar_pd_256:
2742 case Intrinsic::x86_avx512_vpermilvar_pd_512:
2743 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
2744 return IC.replaceInstUsesWith(II, V);
2745 }
2746 break;
2747
2748 case Intrinsic::x86_avx2_permd:
2749 case Intrinsic::x86_avx2_permps:
2750 case Intrinsic::x86_avx512_permvar_df_256:
2751 case Intrinsic::x86_avx512_permvar_df_512:
2752 case Intrinsic::x86_avx512_permvar_di_256:
2753 case Intrinsic::x86_avx512_permvar_di_512:
2754 case Intrinsic::x86_avx512_permvar_hi_128:
2755 case Intrinsic::x86_avx512_permvar_hi_256:
2756 case Intrinsic::x86_avx512_permvar_hi_512:
2757 case Intrinsic::x86_avx512_permvar_qi_128:
2758 case Intrinsic::x86_avx512_permvar_qi_256:
2759 case Intrinsic::x86_avx512_permvar_qi_512:
2760 case Intrinsic::x86_avx512_permvar_sf_512:
2761 case Intrinsic::x86_avx512_permvar_si_512:
2762 if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
2763 return IC.replaceInstUsesWith(II, V);
2764 }
2765 break;
2766
2767 case Intrinsic::x86_avx_maskload_ps:
2768 case Intrinsic::x86_avx_maskload_pd:
2769 case Intrinsic::x86_avx_maskload_ps_256:
2770 case Intrinsic::x86_avx_maskload_pd_256:
2771 case Intrinsic::x86_avx2_maskload_d:
2772 case Intrinsic::x86_avx2_maskload_q:
2773 case Intrinsic::x86_avx2_maskload_d_256:
2774 case Intrinsic::x86_avx2_maskload_q_256:
2775 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
2776 return I;
2777 }
2778 break;
2779
2780 case Intrinsic::x86_sse2_maskmov_dqu:
2781 case Intrinsic::x86_avx_maskstore_ps:
2782 case Intrinsic::x86_avx_maskstore_pd:
2783 case Intrinsic::x86_avx_maskstore_ps_256:
2784 case Intrinsic::x86_avx_maskstore_pd_256:
2785 case Intrinsic::x86_avx2_maskstore_d:
2786 case Intrinsic::x86_avx2_maskstore_q:
2787 case Intrinsic::x86_avx2_maskstore_d_256:
2788 case Intrinsic::x86_avx2_maskstore_q_256:
2789 if (simplifyX86MaskedStore(II, IC)) {
2790 return
nullptr;
2791 }
2792 break;
2793
2794 case Intrinsic::x86_addcarry_32:
2795 case Intrinsic::x86_addcarry_64:
2796 if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
2797 return IC.replaceInstUsesWith(II, V);
2798 }
2799 break;
2800
2801 case Intrinsic::x86_avx512_pternlog_d_128:
2802 case Intrinsic::x86_avx512_pternlog_d_256:
2803 case Intrinsic::x86_avx512_pternlog_d_512:
2804 case Intrinsic::x86_avx512_pternlog_q_128:
2805 case Intrinsic::x86_avx512_pternlog_q_256:
2806 case Intrinsic::x86_avx512_pternlog_q_512:
2807 if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
2808 return IC.replaceInstUsesWith(II, V);
2809 }
2810 break;
2811 default:
2812 break;
2813 }
2814 return std::nullopt;
2815 }
2816
2817 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
2818 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
2819 bool &KnownBitsComputed) const {
2820 switch (II.getIntrinsicID()) {
2821 default:
2822 break;
2823 case Intrinsic::x86_mmx_pmovmskb:
2824 case Intrinsic::x86_sse_movmsk_ps:
2825 case Intrinsic::x86_sse2_movmsk_pd:
2826 case Intrinsic::x86_sse2_pmovmskb_128:
2827 case Intrinsic::x86_avx_movmsk_ps_256:
2828 case Intrinsic::x86_avx_movmsk_pd_256:
2829 case Intrinsic::x86_avx2_pmovmskb: {
2830 // MOVMSK copies the vector elements' sign bits to the low bits
2831 // and zeros the high bits.
2832 unsigned ArgWidth;
2833 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
2834 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
2835 } else {
2836 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
2837 ArgWidth = ArgType->getNumElements();
2838 }
2839
2840 // If we don't need any of the low bits then return zero; we already
2841 // know that DemandedMask is non-zero.
2842 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
2843 Type *VTy = II.getType();
2844 if (DemandedElts.isZero()) {
2845 return ConstantInt::getNullValue(VTy);
2846 }
2847
2848 // We know that the upper bits are set to zero.
2849 Known.Zero.setBitsFrom(ArgWidth);
2850 KnownBitsComputed = true;
2851 break;
2852 }
2853 }
2854 return std::nullopt;
2855 }
2856
2857 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2858 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2859 APInt &UndefElts2, APInt &UndefElts3,
2860 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2861 simplifyAndSetOp) const {
2862 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
2863 switch (II.getIntrinsicID()) {
2864 default:
2865 break;
2866 case Intrinsic::x86_xop_vfrcz_ss:
2867 case Intrinsic::x86_xop_vfrcz_sd:
2868 // The instructions for these intrinsics are specified to zero the upper
2869 // bits, not pass them through like other scalar intrinsics. So we
2870 // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
2871 // other intrinsics. Instead we should return a zero vector.
2872 if (!DemandedElts[0]) {
2873 IC.addToWorklist(&II);
2874 return ConstantAggregateZero::get(II.getType());
2875 }
2876
2877 // Only the lower element is used.
2878 DemandedElts = 1;
2879 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2880
2881 // Only the lower element is undefined. The high elements are zero.
2882 UndefElts = UndefElts[0];
2883 break;
2884
2885 // Unary scalar-as-vector operations that work column-wise.
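// For example (illustrative), rcp_ss only computes element 0; elements 1-3 of
// the result are passed through from operand 0, so if element 0 is not
// demanded the whole call can be replaced by operand 0, as done below.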
2886 case Intrinsic::x86_sse_rcp_ss: 2887 case Intrinsic::x86_sse_rsqrt_ss: 2888 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 2889 2890 // If lowest element of a scalar op isn't used then use Arg0. 2891 if (!DemandedElts[0]) { 2892 IC.addToWorklist(&II); 2893 return II.getArgOperand(0); 2894 } 2895 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions 2896 // checks). 2897 break; 2898 2899 // Binary scalar-as-vector operations that work column-wise. The high 2900 // elements come from operand 0. The low element is a function of both 2901 // operands. 2902 case Intrinsic::x86_sse_min_ss: 2903 case Intrinsic::x86_sse_max_ss: 2904 case Intrinsic::x86_sse_cmp_ss: 2905 case Intrinsic::x86_sse2_min_sd: 2906 case Intrinsic::x86_sse2_max_sd: 2907 case Intrinsic::x86_sse2_cmp_sd: { 2908 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 2909 2910 // If lowest element of a scalar op isn't used then use Arg0. 2911 if (!DemandedElts[0]) { 2912 IC.addToWorklist(&II); 2913 return II.getArgOperand(0); 2914 } 2915 2916 // Only lower element is used for operand 1. 2917 DemandedElts = 1; 2918 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 2919 2920 // Lower element is undefined if both lower elements are undefined. 2921 // Consider things like undef&0. The result is known zero, not undef. 2922 if (!UndefElts2[0]) 2923 UndefElts.clearBit(0); 2924 2925 break; 2926 } 2927 2928 // Binary scalar-as-vector operations that work column-wise. The high 2929 // elements come from operand 0 and the low element comes from operand 1. 2930 case Intrinsic::x86_sse41_round_ss: 2931 case Intrinsic::x86_sse41_round_sd: { 2932 // Don't use the low element of operand 0. 2933 APInt DemandedElts2 = DemandedElts; 2934 DemandedElts2.clearBit(0); 2935 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); 2936 2937 // If lowest element of a scalar op isn't used then use Arg0. 2938 if (!DemandedElts[0]) { 2939 IC.addToWorklist(&II); 2940 return II.getArgOperand(0); 2941 } 2942 2943 // Only lower element is used for operand 1. 2944 DemandedElts = 1; 2945 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 2946 2947 // Take the high undef elements from operand 0 and take the lower element 2948 // from operand 1. 2949 UndefElts.clearBit(0); 2950 UndefElts |= UndefElts2[0]; 2951 break; 2952 } 2953 2954 // Three input scalar-as-vector operations that work column-wise. The high 2955 // elements come from operand 0 and the low element is a function of all 2956 // three inputs. 2957 case Intrinsic::x86_avx512_mask_add_ss_round: 2958 case Intrinsic::x86_avx512_mask_div_ss_round: 2959 case Intrinsic::x86_avx512_mask_mul_ss_round: 2960 case Intrinsic::x86_avx512_mask_sub_ss_round: 2961 case Intrinsic::x86_avx512_mask_max_ss_round: 2962 case Intrinsic::x86_avx512_mask_min_ss_round: 2963 case Intrinsic::x86_avx512_mask_add_sd_round: 2964 case Intrinsic::x86_avx512_mask_div_sd_round: 2965 case Intrinsic::x86_avx512_mask_mul_sd_round: 2966 case Intrinsic::x86_avx512_mask_sub_sd_round: 2967 case Intrinsic::x86_avx512_mask_max_sd_round: 2968 case Intrinsic::x86_avx512_mask_min_sd_round: 2969 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 2970 2971 // If lowest element of a scalar op isn't used then use Arg0. 2972 if (!DemandedElts[0]) { 2973 IC.addToWorklist(&II); 2974 return II.getArgOperand(0); 2975 } 2976 2977 // Only lower element is used for operand 1 and 2. 
2978 DemandedElts = 1; 2979 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 2980 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); 2981 2982 // Lower element is undefined if all three lower elements are undefined. 2983 // Consider things like undef&0. The result is known zero, not undef. 2984 if (!UndefElts2[0] || !UndefElts3[0]) 2985 UndefElts.clearBit(0); 2986 break; 2987 2988 // TODO: Add fmaddsub support? 2989 case Intrinsic::x86_sse3_addsub_pd: 2990 case Intrinsic::x86_sse3_addsub_ps: 2991 case Intrinsic::x86_avx_addsub_pd_256: 2992 case Intrinsic::x86_avx_addsub_ps_256: { 2993 // If none of the even or none of the odd lanes are required, turn this 2994 // into a generic FP math instruction. 2995 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); 2996 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); 2997 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); 2998 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); 2999 if (IsSubOnly || IsAddOnly) { 3000 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); 3001 IRBuilderBase::InsertPointGuard Guard(IC.Builder); 3002 IC.Builder.SetInsertPoint(&II); 3003 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); 3004 return IC.Builder.CreateBinOp( 3005 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1); 3006 } 3007 3008 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3009 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3010 UndefElts &= UndefElts2; 3011 break; 3012 } 3013 3014 // General per-element vector operations. 3015 case Intrinsic::x86_avx2_psllv_d: 3016 case Intrinsic::x86_avx2_psllv_d_256: 3017 case Intrinsic::x86_avx2_psllv_q: 3018 case Intrinsic::x86_avx2_psllv_q_256: 3019 case Intrinsic::x86_avx2_psrlv_d: 3020 case Intrinsic::x86_avx2_psrlv_d_256: 3021 case Intrinsic::x86_avx2_psrlv_q: 3022 case Intrinsic::x86_avx2_psrlv_q_256: 3023 case Intrinsic::x86_avx2_psrav_d: 3024 case Intrinsic::x86_avx2_psrav_d_256: { 3025 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3026 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3027 UndefElts &= UndefElts2; 3028 break; 3029 } 3030 3031 case Intrinsic::x86_sse2_packssdw_128: 3032 case Intrinsic::x86_sse2_packsswb_128: 3033 case Intrinsic::x86_sse2_packuswb_128: 3034 case Intrinsic::x86_sse41_packusdw: 3035 case Intrinsic::x86_avx2_packssdw: 3036 case Intrinsic::x86_avx2_packsswb: 3037 case Intrinsic::x86_avx2_packusdw: 3038 case Intrinsic::x86_avx2_packuswb: 3039 case Intrinsic::x86_avx512_packssdw_512: 3040 case Intrinsic::x86_avx512_packsswb_512: 3041 case Intrinsic::x86_avx512_packusdw_512: 3042 case Intrinsic::x86_avx512_packuswb_512: { 3043 auto *Ty0 = II.getArgOperand(0)->getType(); 3044 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); 3045 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); 3046 3047 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; 3048 unsigned VWidthPerLane = VWidth / NumLanes; 3049 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; 3050 3051 // Per lane, pack the elements of the first input and then the second. 3052 // e.g. 
3053 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) 3054 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) 3055 for (int OpNum = 0; OpNum != 2; ++OpNum) { 3056 APInt OpDemandedElts(InnerVWidth, 0); 3057 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3058 unsigned LaneIdx = Lane * VWidthPerLane; 3059 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { 3060 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; 3061 if (DemandedElts[Idx]) 3062 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); 3063 } 3064 } 3065 3066 // Demand elements from the operand. 3067 APInt OpUndefElts(InnerVWidth, 0); 3068 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); 3069 3070 // Pack the operand's UNDEF elements, one lane at a time. 3071 OpUndefElts = OpUndefElts.zext(VWidth); 3072 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3073 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); 3074 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); 3075 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); 3076 UndefElts |= LaneElts; 3077 } 3078 } 3079 break; 3080 } 3081 3082 case Intrinsic::x86_sse2_pmadd_wd: 3083 case Intrinsic::x86_avx2_pmadd_wd: 3084 case Intrinsic::x86_avx512_pmaddw_d_512: 3085 case Intrinsic::x86_ssse3_pmadd_ub_sw_128: 3086 case Intrinsic::x86_avx2_pmadd_ub_sw: 3087 case Intrinsic::x86_avx512_pmaddubs_w_512: { 3088 // PMADD - demand both src elements that map to each dst element. 3089 auto *ArgTy = II.getArgOperand(0)->getType(); 3090 unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements(); 3091 assert((VWidth * 2) == InnerVWidth && "Unexpected input size"); 3092 APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth); 3093 APInt Op0UndefElts(InnerVWidth, 0); 3094 APInt Op1UndefElts(InnerVWidth, 0); 3095 simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts); 3096 simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts); 3097 break; 3098 } 3099 3100 // PSHUFB 3101 case Intrinsic::x86_ssse3_pshuf_b_128: 3102 case Intrinsic::x86_avx2_pshuf_b: 3103 case Intrinsic::x86_avx512_pshuf_b_512: 3104 // PERMILVAR 3105 case Intrinsic::x86_avx_vpermilvar_ps: 3106 case Intrinsic::x86_avx_vpermilvar_ps_256: 3107 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3108 case Intrinsic::x86_avx_vpermilvar_pd: 3109 case Intrinsic::x86_avx_vpermilvar_pd_256: 3110 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3111 // PERMV 3112 case Intrinsic::x86_avx2_permd: 3113 case Intrinsic::x86_avx2_permps: { 3114 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); 3115 break; 3116 } 3117 3118 // SSE4A instructions leave the upper 64-bits of the 128-bit result 3119 // in an undefined state. 3120 case Intrinsic::x86_sse4a_extrq: 3121 case Intrinsic::x86_sse4a_extrqi: 3122 case Intrinsic::x86_sse4a_insertq: 3123 case Intrinsic::x86_sse4a_insertqi: 3124 UndefElts.setHighBits(VWidth / 2); 3125 break; 3126 } 3127 return std::nullopt; 3128 } 3129