//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the X86-specific parts of InstCombine's intrinsic
/// simplification. It uses the target's detailed information to fold X86
/// vector and bit-manipulation intrinsics into generic IR (or simpler
/// intrinsics) where possible, while leaving everything else to the
/// target-independent InstCombine logic.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
                            V);
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (PatternMatch::match(
          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to
    // match the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
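    // For example (illustrative sketch; the exact types depend on the
    // intrinsic), an avx.maskload with a sign-extended bool mask becomes
    // roughly:
    //   %castvec = bitcast float* %ptr to <8 x float>*
    //   %1 = call <8 x float> @llvm.masked.load(<8 x float>* %castvec, i32 1,
    //                                           <8 x i1> %bool,
    //                                           <8 x float> zeroinitializer)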
    CallInst *NewMaskedLoad =
        IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    LLVM_FALLTHROUGH;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto Vec = II.getArgOperand(0);
  auto Amt = II.getArgOperand(1);
  auto VT = cast<FixedVectorType>(Vec->getType());
  auto SVT = VT->getElementType();
  auto AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getModule()->getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getModule()->getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses only the first 64 bits of the 128-bit vector
  // operand to compute the shift amount.
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isNullValue())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
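// For example (illustrative), a constant, fully in-range amount such as
//   %r = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %x,
//                                              <4 x i32> <i32 0, i32 1,
//                                                         i32 2, i32 3>)
// can be rewritten as the generic
//   %r = lshr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>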
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  auto Vec = II.getArgOperand(0);
  auto Amt = II.getArgOperand(1);
  auto VT = cast<FixedVectorType>(II.getType());
  auto SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  APInt UpperBits =
      APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
  if (llvm::MaskedValueIsZero(Amt, UpperBits,
                              II.getModule()->getDataLayout())) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getNullValue(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
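  // For example, for the 256-bit PACKSSDW case (two <8 x i32> sources,
  // NumLanes == 2) the mask built below is
  //   <0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15>
  // i.e. the two sources are interleaved 128-bit lane by 128-bit lane.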
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerVecTy = VectorType::getInteger(ArgTy);
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
  Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
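    // The x86 addcarry result is { i8 carry-out, iN sum } while
    // uadd.with.overflow yields { iN sum, i1 overflow }, so swap the two
    // fields and widen the overflow flag to i8.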
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //    [3:0] - zero mask for each 32-bit lane
  //    [5:4] - select one 32-bit destination lane
  //    [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length; other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length; other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
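  // For example, a control byte of 0x83 has bit 7 set, so that result byte is
  // taken from the zero vector (the second shuffle operand below), whereas a
  // control byte of 0x03 selects byte 3 of the source within the same 128-bit
  // lane.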
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(V1->getType());
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(VecTy);
  return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, Size));
}

Optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;

  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    Value *Arg = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:
    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    bool MadeChange = false;
    Value *Arg0 = II.getArgOperand(0);
    Value *Arg1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
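    // For example (illustrative), with the default rounding mode:
    //   %r = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a,
    //                                                      <16 x float> %b,
    //                                                      i32 4)
    // becomes a plain 'fadd <16 x float> %a, %b'.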
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      if (R->getValue() == 4) {
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_add_ps_512:
        case Intrinsic::x86_avx512_add_pd_512:
          V = IC.Builder.CreateFAdd(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_sub_ps_512:
        case Intrinsic::x86_avx512_sub_pd_512:
          V = IC.Builder.CreateFSub(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_mul_ps_512:
        case Intrinsic::x86_avx512_mul_pd_512:
          V = IC.Builder.CreateFMul(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_div_ps_512:
        case Intrinsic::x86_avx512_div_pd_512:
          V = IC.Builder.CreateFDiv(Arg0, Arg1);
          break;
        }

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the elements as scalars.
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);
        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = IC.Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = IC.Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = IC.Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = IC.Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II.getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = FixedVectorType::get(
              IC.Builder.getInt1Ty(),
              cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru =
              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
          V = IC.Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }

  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();

      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();

      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }

      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }

      // If the demanded elements of either input are undef, the result is zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    Constant *C1 = dyn_cast<Constant>(Op1);
    ConstantInt *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    ConstantInt *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operand and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));

    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    Constant *C1 = dyn_cast<Constant>(Op1);
    ConstantInt *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source
    // and insert over first source starting at Index bit. The upper 64-bits
    // are undefined.
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));

    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }

    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
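    // Illustrative IR for this fold (values are hypothetical), where the mask
    // is a sign-extended bool vector:
    //   %m  = sext <4 x i1> %b to <4 x i32>
    //   %bc = bitcast <4 x i32> %m to <4 x float>
    //   %r  = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %x,
    //                                                   <4 x float> %y,
    //                                                   <4 x float> %bc)
    // folds to:
    //   %r  = select <4 x i1> %b, <4 x float> %y, <4 x float> %x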
    Value *BoolVec;
    Mask = InstCombiner::peekThroughBitcast(Mask);
    if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      assert(Mask->getType()->getPrimitiveSizeInBits() ==
                 II.getType()->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      unsigned NumMaskElts =
          cast<FixedVectorType>(Mask->getType())->getNumElements();
      unsigned NumOperandElts =
          cast<FixedVectorType>(II.getType())->getNumElements();
      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }

    break;
  }

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;

  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;

  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  default:
    break;
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto *Arg = II.getArgOperand(0);
      auto *ArgType = cast<FixedVectorType>(Arg->getType());
      ArgWidth = ArgType->getNumElements();
    }

    // If we don't need any of the low bits then return zero; we already know
    // that DemandedMask is non-zero.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isNullValue()) {
      return ConstantInt::getNullValue(VTy);
    }

    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return None;
}

Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are specified to zero the upper
    // bits rather than pass them through like other scalar intrinsics. So we
    // shouldn't just use Arg0 if DemandedElts[0] is clear, as we do for other
    // intrinsics; instead we should return a zero vector.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return ConstantAggregateZero::get(II.getType());
    }

    // Only the lower element is used.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // Only the lower element is undefined. The high elements are zero.
    UndefElts = UndefElts[0];
    break;

  // Unary scalar-as-vector operations that work column-wise.
  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // TODO: If only the low element is used, lower SQRT to FSQRT (with
    // rounding/exception checks).
    break;

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0. The low element is a function of both
  // operands.
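  // For example, MINSS computes only the low lane from both operands
  // (Op0[0] and Op1[0]); lanes 1-3 of the result are passed through from Op0.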
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Lower element is undefined if both lower elements are undefined.
    // Consider things like undef&0. The result is known zero, not undef.
    if (!UndefElts2[0])
      UndefElts.clearBit(0);

    break;
  }

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element comes from operand 1.
  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    // Don't use the low element of operand 0.
    APInt DemandedElts2 = DemandedElts;
    DemandedElts2.clearBit(0);
    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Take the high undef elements from operand 0 and take the lower element
    // from operand 1.
    UndefElts.clearBit(0);
    UndefElts |= UndefElts2[0];
    break;
  }

  // Three input scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element is a function of all
  // three inputs.
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operands 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

    // The lower element is undefined only if all three lower elements are
    // undefined. Consider things like undef&0. The result is known zero,
    // not undef.
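    // UndefElts currently reflects operand 0; keep bit 0 set only when the
    // low elements of operands 1 and 2 are also undef.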
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);

    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");

    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;

    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }

      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);

      // Pack the operand's UNDEF elements, one lane at a time.
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }

  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return None;
}