1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This file implements a TargetTransformInfo analysis pass specific to the 10 /// X86 target machine. It uses the target's detailed information to provide 11 /// more precise answers to certain TTI queries, while letting the target 12 /// independent and default TTI implementations handle the rest. 13 /// 14 //===----------------------------------------------------------------------===// 15 16 #include "X86TargetTransformInfo.h" 17 #include "llvm/IR/IntrinsicInst.h" 18 #include "llvm/IR/IntrinsicsX86.h" 19 #include "llvm/Support/KnownBits.h" 20 #include "llvm/Transforms/InstCombine/InstCombiner.h" 21 #include <optional> 22 23 using namespace llvm; 24 25 #define DEBUG_TYPE "x86tti" 26 27 /// Return a constant boolean vector that has true elements in all positions 28 /// where the input constant data vector has an element with the sign bit set. 29 static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) { 30 VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType())); 31 V = ConstantExpr::getBitCast(V, IntTy); 32 V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT, 33 Constant::getNullValue(IntTy), V, DL); 34 assert(V && "Vector must be foldable"); 35 return V; 36 } 37 38 /// Convert the x86 XMM integer vector mask to a vector of bools based on 39 /// each element's most significant bit (the sign bit). 40 static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) { 41 // Fold Constant Mask. 42 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) 43 return getNegativeIsTrueBoolVec(ConstantMask, DL); 44 45 // Mask was extended from a boolean vector. 46 Value *ExtMask; 47 if (PatternMatch::match( 48 Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) && 49 ExtMask->getType()->isIntOrIntVectorTy(1)) 50 return ExtMask; 51 52 return nullptr; 53 } 54 55 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 56 // XMM register mask efficiently, we could transform all x86 masked intrinsics 57 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 58 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) { 59 Value *Ptr = II.getOperand(0); 60 Value *Mask = II.getOperand(1); 61 Constant *ZeroVec = Constant::getNullValue(II.getType()); 62 63 // Zero Mask - masked load instruction creates a zero vector. 64 if (isa<ConstantAggregateZero>(Mask)) 65 return IC.replaceInstUsesWith(II, ZeroVec); 66 67 // The mask is constant or extended from a bool vector. Convert this x86 68 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 69 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) { 70 // First, cast the x86 intrinsic scalar pointer to a vector pointer to match 71 // the LLVM intrinsic definition for the pointer argument. 72 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 73 PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace); 74 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 75 76 // The pass-through vector for an x86 masked load is a zero vector. 
77 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad( 78 II.getType(), PtrCast, Align(1), BoolMask, ZeroVec); 79 return IC.replaceInstUsesWith(II, NewMaskedLoad); 80 } 81 82 return nullptr; 83 } 84 85 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 86 // XMM register mask efficiently, we could transform all x86 masked intrinsics 87 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 88 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { 89 Value *Ptr = II.getOperand(0); 90 Value *Mask = II.getOperand(1); 91 Value *Vec = II.getOperand(2); 92 93 // Zero Mask - this masked store instruction does nothing. 94 if (isa<ConstantAggregateZero>(Mask)) { 95 IC.eraseInstFromFunction(II); 96 return true; 97 } 98 99 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do 100 // anything else at this level. 101 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) 102 return false; 103 104 // The mask is constant or extended from a bool vector. Convert this x86 105 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 106 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) { 107 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 108 PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace); 109 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 110 111 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); 112 113 // 'Replace uses' doesn't work for stores. Erase the original masked store. 114 IC.eraseInstFromFunction(II); 115 return true; 116 } 117 118 return false; 119 } 120 121 static Value *simplifyX86immShift(const IntrinsicInst &II, 122 InstCombiner::BuilderTy &Builder) { 123 bool LogicalShift = false; 124 bool ShiftLeft = false; 125 bool IsImm = false; 126 127 switch (II.getIntrinsicID()) { 128 default: 129 llvm_unreachable("Unexpected intrinsic!"); 130 case Intrinsic::x86_sse2_psrai_d: 131 case Intrinsic::x86_sse2_psrai_w: 132 case Intrinsic::x86_avx2_psrai_d: 133 case Intrinsic::x86_avx2_psrai_w: 134 case Intrinsic::x86_avx512_psrai_q_128: 135 case Intrinsic::x86_avx512_psrai_q_256: 136 case Intrinsic::x86_avx512_psrai_d_512: 137 case Intrinsic::x86_avx512_psrai_q_512: 138 case Intrinsic::x86_avx512_psrai_w_512: 139 IsImm = true; 140 [[fallthrough]]; 141 case Intrinsic::x86_sse2_psra_d: 142 case Intrinsic::x86_sse2_psra_w: 143 case Intrinsic::x86_avx2_psra_d: 144 case Intrinsic::x86_avx2_psra_w: 145 case Intrinsic::x86_avx512_psra_q_128: 146 case Intrinsic::x86_avx512_psra_q_256: 147 case Intrinsic::x86_avx512_psra_d_512: 148 case Intrinsic::x86_avx512_psra_q_512: 149 case Intrinsic::x86_avx512_psra_w_512: 150 LogicalShift = false; 151 ShiftLeft = false; 152 break; 153 case Intrinsic::x86_sse2_psrli_d: 154 case Intrinsic::x86_sse2_psrli_q: 155 case Intrinsic::x86_sse2_psrli_w: 156 case Intrinsic::x86_avx2_psrli_d: 157 case Intrinsic::x86_avx2_psrli_q: 158 case Intrinsic::x86_avx2_psrli_w: 159 case Intrinsic::x86_avx512_psrli_d_512: 160 case Intrinsic::x86_avx512_psrli_q_512: 161 case Intrinsic::x86_avx512_psrli_w_512: 162 IsImm = true; 163 [[fallthrough]]; 164 case Intrinsic::x86_sse2_psrl_d: 165 case Intrinsic::x86_sse2_psrl_q: 166 case Intrinsic::x86_sse2_psrl_w: 167 case Intrinsic::x86_avx2_psrl_d: 168 case Intrinsic::x86_avx2_psrl_q: 169 case Intrinsic::x86_avx2_psrl_w: 170 case Intrinsic::x86_avx512_psrl_d_512: 171 case Intrinsic::x86_avx512_psrl_q_512: 172 case Intrinsic::x86_avx512_psrl_w_512: 
173 LogicalShift = true; 174 ShiftLeft = false; 175 break; 176 case Intrinsic::x86_sse2_pslli_d: 177 case Intrinsic::x86_sse2_pslli_q: 178 case Intrinsic::x86_sse2_pslli_w: 179 case Intrinsic::x86_avx2_pslli_d: 180 case Intrinsic::x86_avx2_pslli_q: 181 case Intrinsic::x86_avx2_pslli_w: 182 case Intrinsic::x86_avx512_pslli_d_512: 183 case Intrinsic::x86_avx512_pslli_q_512: 184 case Intrinsic::x86_avx512_pslli_w_512: 185 IsImm = true; 186 [[fallthrough]]; 187 case Intrinsic::x86_sse2_psll_d: 188 case Intrinsic::x86_sse2_psll_q: 189 case Intrinsic::x86_sse2_psll_w: 190 case Intrinsic::x86_avx2_psll_d: 191 case Intrinsic::x86_avx2_psll_q: 192 case Intrinsic::x86_avx2_psll_w: 193 case Intrinsic::x86_avx512_psll_d_512: 194 case Intrinsic::x86_avx512_psll_q_512: 195 case Intrinsic::x86_avx512_psll_w_512: 196 LogicalShift = true; 197 ShiftLeft = true; 198 break; 199 } 200 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 201 202 Value *Vec = II.getArgOperand(0); 203 Value *Amt = II.getArgOperand(1); 204 auto *VT = cast<FixedVectorType>(Vec->getType()); 205 Type *SVT = VT->getElementType(); 206 Type *AmtVT = Amt->getType(); 207 unsigned VWidth = VT->getNumElements(); 208 unsigned BitWidth = SVT->getPrimitiveSizeInBits(); 209 210 // If the shift amount is guaranteed to be in-range we can replace it with a 211 // generic shift. If its guaranteed to be out of range, logical shifts combine 212 // to zero and arithmetic shifts are clamped to (BitWidth - 1). 213 if (IsImm) { 214 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); 215 KnownBits KnownAmtBits = 216 llvm::computeKnownBits(Amt, II.getDataLayout()); 217 if (KnownAmtBits.getMaxValue().ult(BitWidth)) { 218 Amt = Builder.CreateZExtOrTrunc(Amt, SVT); 219 Amt = Builder.CreateVectorSplat(VWidth, Amt); 220 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 221 : Builder.CreateLShr(Vec, Amt)) 222 : Builder.CreateAShr(Vec, Amt)); 223 } 224 if (KnownAmtBits.getMinValue().uge(BitWidth)) { 225 if (LogicalShift) 226 return ConstantAggregateZero::get(VT); 227 Amt = ConstantInt::get(SVT, BitWidth - 1); 228 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); 229 } 230 } else { 231 // Ensure the first element has an in-range value and the rest of the 232 // elements in the bottom 64 bits are zero. 233 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 234 cast<VectorType>(AmtVT)->getElementType() == SVT && 235 "Unexpected shift-by-scalar type"); 236 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements(); 237 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); 238 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); 239 KnownBits KnownLowerBits = llvm::computeKnownBits( 240 Amt, DemandedLower, II.getDataLayout()); 241 KnownBits KnownUpperBits = llvm::computeKnownBits( 242 Amt, DemandedUpper, II.getDataLayout()); 243 if (KnownLowerBits.getMaxValue().ult(BitWidth) && 244 (DemandedUpper.isZero() || KnownUpperBits.isZero())) { 245 SmallVector<int, 16> ZeroSplat(VWidth, 0); 246 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat); 247 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 248 : Builder.CreateLShr(Vec, Amt)) 249 : Builder.CreateAShr(Vec, Amt)); 250 } 251 } 252 253 // Simplify if count is constant vector. 254 auto *CDV = dyn_cast<ConstantDataVector>(Amt); 255 if (!CDV) 256 return nullptr; 257 258 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector 259 // operand to compute the shift amount. 
260 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 261 cast<VectorType>(AmtVT)->getElementType() == SVT && 262 "Unexpected shift-by-scalar type"); 263 264 // Concatenate the sub-elements to create the 64-bit value. 265 APInt Count(64, 0); 266 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { 267 unsigned SubEltIdx = (NumSubElts - 1) - i; 268 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); 269 Count <<= BitWidth; 270 Count |= SubElt->getValue().zextOrTrunc(64); 271 } 272 273 // If shift-by-zero then just return the original value. 274 if (Count.isZero()) 275 return Vec; 276 277 // Handle cases when Shift >= BitWidth. 278 if (Count.uge(BitWidth)) { 279 // If LogicalShift - just return zero. 280 if (LogicalShift) 281 return ConstantAggregateZero::get(VT); 282 283 // If ArithmeticShift - clamp Shift to (BitWidth - 1). 284 Count = APInt(64, BitWidth - 1); 285 } 286 287 // Get a constant vector of the same type as the first operand. 288 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); 289 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); 290 291 if (ShiftLeft) 292 return Builder.CreateShl(Vec, ShiftVec); 293 294 if (LogicalShift) 295 return Builder.CreateLShr(Vec, ShiftVec); 296 297 return Builder.CreateAShr(Vec, ShiftVec); 298 } 299 300 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. 301 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out 302 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 303 static Value *simplifyX86varShift(const IntrinsicInst &II, 304 InstCombiner::BuilderTy &Builder) { 305 bool LogicalShift = false; 306 bool ShiftLeft = false; 307 308 switch (II.getIntrinsicID()) { 309 default: 310 llvm_unreachable("Unexpected intrinsic!"); 311 case Intrinsic::x86_avx2_psrav_d: 312 case Intrinsic::x86_avx2_psrav_d_256: 313 case Intrinsic::x86_avx512_psrav_q_128: 314 case Intrinsic::x86_avx512_psrav_q_256: 315 case Intrinsic::x86_avx512_psrav_d_512: 316 case Intrinsic::x86_avx512_psrav_q_512: 317 case Intrinsic::x86_avx512_psrav_w_128: 318 case Intrinsic::x86_avx512_psrav_w_256: 319 case Intrinsic::x86_avx512_psrav_w_512: 320 LogicalShift = false; 321 ShiftLeft = false; 322 break; 323 case Intrinsic::x86_avx2_psrlv_d: 324 case Intrinsic::x86_avx2_psrlv_d_256: 325 case Intrinsic::x86_avx2_psrlv_q: 326 case Intrinsic::x86_avx2_psrlv_q_256: 327 case Intrinsic::x86_avx512_psrlv_d_512: 328 case Intrinsic::x86_avx512_psrlv_q_512: 329 case Intrinsic::x86_avx512_psrlv_w_128: 330 case Intrinsic::x86_avx512_psrlv_w_256: 331 case Intrinsic::x86_avx512_psrlv_w_512: 332 LogicalShift = true; 333 ShiftLeft = false; 334 break; 335 case Intrinsic::x86_avx2_psllv_d: 336 case Intrinsic::x86_avx2_psllv_d_256: 337 case Intrinsic::x86_avx2_psllv_q: 338 case Intrinsic::x86_avx2_psllv_q_256: 339 case Intrinsic::x86_avx512_psllv_d_512: 340 case Intrinsic::x86_avx512_psllv_q_512: 341 case Intrinsic::x86_avx512_psllv_w_128: 342 case Intrinsic::x86_avx512_psllv_w_256: 343 case Intrinsic::x86_avx512_psllv_w_512: 344 LogicalShift = true; 345 ShiftLeft = true; 346 break; 347 } 348 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 349 350 Value *Vec = II.getArgOperand(0); 351 Value *Amt = II.getArgOperand(1); 352 auto *VT = cast<FixedVectorType>(II.getType()); 353 Type *SVT = VT->getElementType(); 354 int NumElts = VT->getNumElements(); 355 int BitWidth = SVT->getIntegerBitWidth(); 356 357 // If the shift 
amount is guaranteed to be in-range we can replace it with a 358 // generic shift. 359 KnownBits KnownAmt = 360 llvm::computeKnownBits(Amt, II.getDataLayout()); 361 if (KnownAmt.getMaxValue().ult(BitWidth)) { 362 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 363 : Builder.CreateLShr(Vec, Amt)) 364 : Builder.CreateAShr(Vec, Amt)); 365 } 366 367 // Simplify if all shift amounts are constant/undef. 368 auto *CShift = dyn_cast<Constant>(Amt); 369 if (!CShift) 370 return nullptr; 371 372 // Collect each element's shift amount. 373 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. 374 bool AnyOutOfRange = false; 375 SmallVector<int, 8> ShiftAmts; 376 for (int I = 0; I < NumElts; ++I) { 377 auto *CElt = CShift->getAggregateElement(I); 378 if (isa_and_nonnull<UndefValue>(CElt)) { 379 ShiftAmts.push_back(-1); 380 continue; 381 } 382 383 auto *COp = dyn_cast_or_null<ConstantInt>(CElt); 384 if (!COp) 385 return nullptr; 386 387 // Handle out of range shifts. 388 // If LogicalShift - set to BitWidth (special case). 389 // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 390 APInt ShiftVal = COp->getValue(); 391 if (ShiftVal.uge(BitWidth)) { 392 AnyOutOfRange = LogicalShift; 393 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); 394 continue; 395 } 396 397 ShiftAmts.push_back((int)ShiftVal.getZExtValue()); 398 } 399 400 // If all elements out of range or UNDEF, return vector of zeros/undefs. 401 // ArithmeticShift should only hit this if they are all UNDEF. 402 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; 403 if (llvm::all_of(ShiftAmts, OutOfRange)) { 404 SmallVector<Constant *, 8> ConstantVec; 405 for (int Idx : ShiftAmts) { 406 if (Idx < 0) { 407 ConstantVec.push_back(UndefValue::get(SVT)); 408 } else { 409 assert(LogicalShift && "Logical shift expected"); 410 ConstantVec.push_back(ConstantInt::getNullValue(SVT)); 411 } 412 } 413 return ConstantVector::get(ConstantVec); 414 } 415 416 // We can't handle only some out of range values with generic logical shifts. 417 if (AnyOutOfRange) 418 return nullptr; 419 420 // Build the shift amount constant vector. 421 SmallVector<Constant *, 8> ShiftVecAmts; 422 for (int Idx : ShiftAmts) { 423 if (Idx < 0) 424 ShiftVecAmts.push_back(UndefValue::get(SVT)); 425 else 426 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); 427 } 428 auto ShiftVec = ConstantVector::get(ShiftVecAmts); 429 430 if (ShiftLeft) 431 return Builder.CreateShl(Vec, ShiftVec); 432 433 if (LogicalShift) 434 return Builder.CreateLShr(Vec, ShiftVec); 435 436 return Builder.CreateAShr(Vec, ShiftVec); 437 } 438 439 static Value *simplifyX86pack(IntrinsicInst &II, 440 InstCombiner::BuilderTy &Builder, bool IsSigned) { 441 Value *Arg0 = II.getArgOperand(0); 442 Value *Arg1 = II.getArgOperand(1); 443 Type *ResTy = II.getType(); 444 445 // Fast all undef handling. 
446 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) 447 return UndefValue::get(ResTy); 448 449 auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 450 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; 451 unsigned NumSrcElts = ArgTy->getNumElements(); 452 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) && 453 "Unexpected packing types"); 454 455 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; 456 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); 457 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); 458 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && 459 "Unexpected packing types"); 460 461 // Constant folding. 462 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 463 return nullptr; 464 465 // Clamp Values - signed/unsigned both use signed clamp values, but they 466 // differ on the min/max values. 467 APInt MinValue, MaxValue; 468 if (IsSigned) { 469 // PACKSS: Truncate signed value with signed saturation. 470 // Source values less than dst minint are saturated to minint. 471 // Source values greater than dst maxint are saturated to maxint. 472 MinValue = 473 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 474 MaxValue = 475 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 476 } else { 477 // PACKUS: Truncate signed value with unsigned saturation. 478 // Source values less than zero are saturated to zero. 479 // Source values greater than dst maxuint are saturated to maxuint. 480 MinValue = APInt::getZero(SrcScalarSizeInBits); 481 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); 482 } 483 484 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); 485 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); 486 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); 487 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); 488 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); 489 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); 490 491 // Shuffle clamped args together at the lane level. 492 SmallVector<int, 32> PackMask; 493 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 494 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 495 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); 496 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 497 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); 498 } 499 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); 500 501 // Truncate to dst size. 502 return Builder.CreateTrunc(Shuffle, ResTy); 503 } 504 505 static Value *simplifyX86pmadd(IntrinsicInst &II, 506 InstCombiner::BuilderTy &Builder, 507 bool IsPMADDWD) { 508 Value *Arg0 = II.getArgOperand(0); 509 Value *Arg1 = II.getArgOperand(1); 510 auto *ResTy = cast<FixedVectorType>(II.getType()); 511 [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 512 513 unsigned NumDstElts = ResTy->getNumElements(); 514 assert(ArgTy->getNumElements() == (2 * NumDstElts) && 515 ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) && 516 "Unexpected PMADD types"); 517 518 // Multiply by zero. 519 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) 520 return ConstantAggregateZero::get(ResTy); 521 522 // Constant folding. 523 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 524 return nullptr; 525 526 // Split Lo/Hi elements pairs, extend and add together. 
527 // PMADDWD(X,Y) = 528 // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1]))) 529 // PMADDUBSW(X,Y) = 530 // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1]))) 531 SmallVector<int> LoMask, HiMask; 532 for (unsigned I = 0; I != NumDstElts; ++I) { 533 LoMask.push_back(2 * I + 0); 534 HiMask.push_back(2 * I + 1); 535 } 536 537 auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask); 538 auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask); 539 auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask); 540 auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask); 541 542 auto LHSCast = 543 IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt; 544 LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy); 545 LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy); 546 RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy); 547 RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy); 548 Value *Lo = Builder.CreateMul(LHSLo, RHSLo); 549 Value *Hi = Builder.CreateMul(LHSHi, RHSHi); 550 return IsPMADDWD 551 ? Builder.CreateAdd(Lo, Hi) 552 : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi}); 553 } 554 555 static Value *simplifyX86movmsk(const IntrinsicInst &II, 556 InstCombiner::BuilderTy &Builder) { 557 Value *Arg = II.getArgOperand(0); 558 Type *ResTy = II.getType(); 559 560 // movmsk(undef) -> zero as we must ensure the upper bits are zero. 561 if (isa<UndefValue>(Arg)) 562 return Constant::getNullValue(ResTy); 563 564 auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType()); 565 // We can't easily peek through x86_mmx types. 566 if (!ArgTy) 567 return nullptr; 568 569 // Expand MOVMSK to compare/bitcast/zext: 570 // e.g. PMOVMSKB(v16i8 x): 571 // %cmp = icmp slt <16 x i8> %x, zeroinitializer 572 // %int = bitcast <16 x i1> %cmp to i16 573 // %res = zext i16 %int to i32 574 unsigned NumElts = ArgTy->getNumElements(); 575 Type *IntegerTy = Builder.getIntNTy(NumElts); 576 577 Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy)); 578 Res = Builder.CreateIsNeg(Res); 579 Res = Builder.CreateBitCast(Res, IntegerTy); 580 Res = Builder.CreateZExtOrTrunc(Res, ResTy); 581 return Res; 582 } 583 584 static Value *simplifyX86addcarry(const IntrinsicInst &II, 585 InstCombiner::BuilderTy &Builder) { 586 Value *CarryIn = II.getArgOperand(0); 587 Value *Op1 = II.getArgOperand(1); 588 Value *Op2 = II.getArgOperand(2); 589 Type *RetTy = II.getType(); 590 Type *OpTy = Op1->getType(); 591 assert(RetTy->getStructElementType(0)->isIntegerTy(8) && 592 RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() && 593 "Unexpected types for x86 addcarry"); 594 595 // If carry-in is zero, this is just an unsigned add with overflow. 596 if (match(CarryIn, PatternMatch::m_ZeroInt())) { 597 Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy, 598 {Op1, Op2}); 599 // The types have to be adjusted to match the x86 call types. 
600 Value *UAddResult = Builder.CreateExtractValue(UAdd, 0); 601 Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1), 602 Builder.getInt8Ty()); 603 Value *Res = PoisonValue::get(RetTy); 604 Res = Builder.CreateInsertValue(Res, UAddOV, 0); 605 return Builder.CreateInsertValue(Res, UAddResult, 1); 606 } 607 608 return nullptr; 609 } 610 611 static Value *simplifyTernarylogic(const IntrinsicInst &II, 612 InstCombiner::BuilderTy &Builder) { 613 614 auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3)); 615 if (!ArgImm || ArgImm->getValue().uge(256)) 616 return nullptr; 617 618 Value *ArgA = II.getArgOperand(0); 619 Value *ArgB = II.getArgOperand(1); 620 Value *ArgC = II.getArgOperand(2); 621 622 Type *Ty = II.getType(); 623 624 auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> { 625 return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second}; 626 }; 627 auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> { 628 return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second}; 629 }; 630 auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> { 631 return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second}; 632 }; 633 auto Not = [&](auto V) -> std::pair<Value *, uint8_t> { 634 return {Builder.CreateNot(V.first), ~V.second}; 635 }; 636 auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); }; 637 auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); }; 638 auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); }; 639 640 bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant()); 641 bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant()); 642 bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant()); 643 644 bool ABIsConst = AIsConst && BIsConst; 645 bool ACIsConst = AIsConst && CIsConst; 646 bool BCIsConst = BIsConst && CIsConst; 647 bool ABCIsConst = AIsConst && BIsConst && CIsConst; 648 649 // Use for verification. Its a big table. Its difficult to go from Imm -> 650 // logic ops, but easy to verify that a set of logic ops is correct. We track 651 // the logic ops through the second value in the pair. At the end it should 652 // equal Imm. 653 std::pair<Value *, uint8_t> A = {ArgA, 0xf0}; 654 std::pair<Value *, uint8_t> B = {ArgB, 0xcc}; 655 std::pair<Value *, uint8_t> C = {ArgC, 0xaa}; 656 std::pair<Value *, uint8_t> Res = {nullptr, 0}; 657 658 // Currently we only handle cases that convert directly to another instruction 659 // or cases where all the ops are constant. This is because we don't properly 660 // handle creating ternary ops in the backend, so splitting them here may 661 // cause regressions. As the backend improves, uncomment more cases. 
662 663 uint8_t Imm = ArgImm->getValue().getZExtValue(); 664 switch (Imm) { 665 case 0x0: 666 Res = {Constant::getNullValue(Ty), 0}; 667 break; 668 case 0x1: 669 if (ABCIsConst) 670 Res = Nor(Or(A, B), C); 671 break; 672 case 0x2: 673 if (ABCIsConst) 674 Res = And(Nor(A, B), C); 675 break; 676 case 0x3: 677 if (ABIsConst) 678 Res = Nor(A, B); 679 break; 680 case 0x4: 681 if (ABCIsConst) 682 Res = And(Nor(A, C), B); 683 break; 684 case 0x5: 685 if (ACIsConst) 686 Res = Nor(A, C); 687 break; 688 case 0x6: 689 if (ABCIsConst) 690 Res = Nor(A, Xnor(B, C)); 691 break; 692 case 0x7: 693 if (ABCIsConst) 694 Res = Nor(A, And(B, C)); 695 break; 696 case 0x8: 697 if (ABCIsConst) 698 Res = Nor(A, Nand(B, C)); 699 break; 700 case 0x9: 701 if (ABCIsConst) 702 Res = Nor(A, Xor(B, C)); 703 break; 704 case 0xa: 705 if (ACIsConst) 706 Res = Nor(A, Not(C)); 707 break; 708 case 0xb: 709 if (ABCIsConst) 710 Res = Nor(A, Nor(C, Not(B))); 711 break; 712 case 0xc: 713 if (ABIsConst) 714 Res = Nor(A, Not(B)); 715 break; 716 case 0xd: 717 if (ABCIsConst) 718 Res = Nor(A, Nor(B, Not(C))); 719 break; 720 case 0xe: 721 if (ABCIsConst) 722 Res = Nor(A, Nor(B, C)); 723 break; 724 case 0xf: 725 Res = Not(A); 726 break; 727 case 0x10: 728 if (ABCIsConst) 729 Res = And(A, Nor(B, C)); 730 break; 731 case 0x11: 732 if (BCIsConst) 733 Res = Nor(B, C); 734 break; 735 case 0x12: 736 if (ABCIsConst) 737 Res = Nor(Xnor(A, C), B); 738 break; 739 case 0x13: 740 if (ABCIsConst) 741 Res = Nor(And(A, C), B); 742 break; 743 case 0x14: 744 if (ABCIsConst) 745 Res = Nor(Xnor(A, B), C); 746 break; 747 case 0x15: 748 if (ABCIsConst) 749 Res = Nor(And(A, B), C); 750 break; 751 case 0x16: 752 if (ABCIsConst) 753 Res = Xor(Xor(A, B), And(Nand(A, B), C)); 754 break; 755 case 0x17: 756 if (ABCIsConst) 757 Res = Xor(Or(A, B), Or(Xnor(A, B), C)); 758 break; 759 case 0x18: 760 if (ABCIsConst) 761 Res = Nor(Xnor(A, B), Xnor(A, C)); 762 break; 763 case 0x19: 764 if (ABCIsConst) 765 Res = And(Nand(A, B), Xnor(B, C)); 766 break; 767 case 0x1a: 768 if (ABCIsConst) 769 Res = Xor(A, Or(And(A, B), C)); 770 break; 771 case 0x1b: 772 if (ABCIsConst) 773 Res = Xor(A, Or(Xnor(A, B), C)); 774 break; 775 case 0x1c: 776 if (ABCIsConst) 777 Res = Xor(A, Or(And(A, C), B)); 778 break; 779 case 0x1d: 780 if (ABCIsConst) 781 Res = Xor(A, Or(Xnor(A, C), B)); 782 break; 783 case 0x1e: 784 if (ABCIsConst) 785 Res = Xor(A, Or(B, C)); 786 break; 787 case 0x1f: 788 if (ABCIsConst) 789 Res = Nand(A, Or(B, C)); 790 break; 791 case 0x20: 792 if (ABCIsConst) 793 Res = Nor(Nand(A, C), B); 794 break; 795 case 0x21: 796 if (ABCIsConst) 797 Res = Nor(Xor(A, C), B); 798 break; 799 case 0x22: 800 if (BCIsConst) 801 Res = Nor(B, Not(C)); 802 break; 803 case 0x23: 804 if (ABCIsConst) 805 Res = Nor(B, Nor(C, Not(A))); 806 break; 807 case 0x24: 808 if (ABCIsConst) 809 Res = Nor(Xnor(A, B), Xor(A, C)); 810 break; 811 case 0x25: 812 if (ABCIsConst) 813 Res = Xor(A, Nand(Nand(A, B), C)); 814 break; 815 case 0x26: 816 if (ABCIsConst) 817 Res = And(Nand(A, B), Xor(B, C)); 818 break; 819 case 0x27: 820 if (ABCIsConst) 821 Res = Xor(Or(Xnor(A, B), C), B); 822 break; 823 case 0x28: 824 if (ABCIsConst) 825 Res = And(Xor(A, B), C); 826 break; 827 case 0x29: 828 if (ABCIsConst) 829 Res = Xor(Xor(A, B), Nor(And(A, B), C)); 830 break; 831 case 0x2a: 832 if (ABCIsConst) 833 Res = And(Nand(A, B), C); 834 break; 835 case 0x2b: 836 if (ABCIsConst) 837 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A); 838 break; 839 case 0x2c: 840 if (ABCIsConst) 841 Res = Nor(Xnor(A, B), Nor(B, C)); 842 break; 843 case 0x2d: 
844 if (ABCIsConst) 845 Res = Xor(A, Or(B, Not(C))); 846 break; 847 case 0x2e: 848 if (ABCIsConst) 849 Res = Xor(A, Or(Xor(A, C), B)); 850 break; 851 case 0x2f: 852 if (ABCIsConst) 853 Res = Nand(A, Or(B, Not(C))); 854 break; 855 case 0x30: 856 if (ABIsConst) 857 Res = Nor(B, Not(A)); 858 break; 859 case 0x31: 860 if (ABCIsConst) 861 Res = Nor(Nor(A, Not(C)), B); 862 break; 863 case 0x32: 864 if (ABCIsConst) 865 Res = Nor(Nor(A, C), B); 866 break; 867 case 0x33: 868 Res = Not(B); 869 break; 870 case 0x34: 871 if (ABCIsConst) 872 Res = And(Xor(A, B), Nand(B, C)); 873 break; 874 case 0x35: 875 if (ABCIsConst) 876 Res = Xor(B, Or(A, Xnor(B, C))); 877 break; 878 case 0x36: 879 if (ABCIsConst) 880 Res = Xor(Or(A, C), B); 881 break; 882 case 0x37: 883 if (ABCIsConst) 884 Res = Nand(Or(A, C), B); 885 break; 886 case 0x38: 887 if (ABCIsConst) 888 Res = Nor(Xnor(A, B), Nor(A, C)); 889 break; 890 case 0x39: 891 if (ABCIsConst) 892 Res = Xor(Or(A, Not(C)), B); 893 break; 894 case 0x3a: 895 if (ABCIsConst) 896 Res = Xor(B, Or(A, Xor(B, C))); 897 break; 898 case 0x3b: 899 if (ABCIsConst) 900 Res = Nand(Or(A, Not(C)), B); 901 break; 902 case 0x3c: 903 Res = Xor(A, B); 904 break; 905 case 0x3d: 906 if (ABCIsConst) 907 Res = Xor(A, Or(Nor(A, C), B)); 908 break; 909 case 0x3e: 910 if (ABCIsConst) 911 Res = Xor(A, Or(Nor(A, Not(C)), B)); 912 break; 913 case 0x3f: 914 if (ABIsConst) 915 Res = Nand(A, B); 916 break; 917 case 0x40: 918 if (ABCIsConst) 919 Res = Nor(Nand(A, B), C); 920 break; 921 case 0x41: 922 if (ABCIsConst) 923 Res = Nor(Xor(A, B), C); 924 break; 925 case 0x42: 926 if (ABCIsConst) 927 Res = Nor(Xor(A, B), Xnor(A, C)); 928 break; 929 case 0x43: 930 if (ABCIsConst) 931 Res = Xor(A, Nand(Nand(A, C), B)); 932 break; 933 case 0x44: 934 if (BCIsConst) 935 Res = Nor(C, Not(B)); 936 break; 937 case 0x45: 938 if (ABCIsConst) 939 Res = Nor(Nor(B, Not(A)), C); 940 break; 941 case 0x46: 942 if (ABCIsConst) 943 Res = Xor(Or(And(A, C), B), C); 944 break; 945 case 0x47: 946 if (ABCIsConst) 947 Res = Xor(Or(Xnor(A, C), B), C); 948 break; 949 case 0x48: 950 if (ABCIsConst) 951 Res = And(Xor(A, C), B); 952 break; 953 case 0x49: 954 if (ABCIsConst) 955 Res = Xor(Or(Xnor(A, B), And(A, C)), C); 956 break; 957 case 0x4a: 958 if (ABCIsConst) 959 Res = Nor(Xnor(A, C), Nor(B, C)); 960 break; 961 case 0x4b: 962 if (ABCIsConst) 963 Res = Xor(A, Or(C, Not(B))); 964 break; 965 case 0x4c: 966 if (ABCIsConst) 967 Res = And(Nand(A, C), B); 968 break; 969 case 0x4d: 970 if (ABCIsConst) 971 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A); 972 break; 973 case 0x4e: 974 if (ABCIsConst) 975 Res = Xor(A, Or(Xor(A, B), C)); 976 break; 977 case 0x4f: 978 if (ABCIsConst) 979 Res = Nand(A, Nand(B, Not(C))); 980 break; 981 case 0x50: 982 if (ACIsConst) 983 Res = Nor(C, Not(A)); 984 break; 985 case 0x51: 986 if (ABCIsConst) 987 Res = Nor(Nor(A, Not(B)), C); 988 break; 989 case 0x52: 990 if (ABCIsConst) 991 Res = And(Xor(A, C), Nand(B, C)); 992 break; 993 case 0x53: 994 if (ABCIsConst) 995 Res = Xor(Or(Xnor(B, C), A), C); 996 break; 997 case 0x54: 998 if (ABCIsConst) 999 Res = Nor(Nor(A, B), C); 1000 break; 1001 case 0x55: 1002 Res = Not(C); 1003 break; 1004 case 0x56: 1005 if (ABCIsConst) 1006 Res = Xor(Or(A, B), C); 1007 break; 1008 case 0x57: 1009 if (ABCIsConst) 1010 Res = Nand(Or(A, B), C); 1011 break; 1012 case 0x58: 1013 if (ABCIsConst) 1014 Res = Nor(Nor(A, B), Xnor(A, C)); 1015 break; 1016 case 0x59: 1017 if (ABCIsConst) 1018 Res = Xor(Or(A, Not(B)), C); 1019 break; 1020 case 0x5a: 1021 Res = Xor(A, C); 1022 break; 1023 case 0x5b: 
1024 if (ABCIsConst) 1025 Res = Xor(A, Or(Nor(A, B), C)); 1026 break; 1027 case 0x5c: 1028 if (ABCIsConst) 1029 Res = Xor(Or(Xor(B, C), A), C); 1030 break; 1031 case 0x5d: 1032 if (ABCIsConst) 1033 Res = Nand(Or(A, Not(B)), C); 1034 break; 1035 case 0x5e: 1036 if (ABCIsConst) 1037 Res = Xor(A, Or(Nor(A, Not(B)), C)); 1038 break; 1039 case 0x5f: 1040 if (ACIsConst) 1041 Res = Nand(A, C); 1042 break; 1043 case 0x60: 1044 if (ABCIsConst) 1045 Res = And(A, Xor(B, C)); 1046 break; 1047 case 0x61: 1048 if (ABCIsConst) 1049 Res = Xor(Or(Xnor(A, B), And(B, C)), C); 1050 break; 1051 case 0x62: 1052 if (ABCIsConst) 1053 Res = Nor(Nor(A, C), Xnor(B, C)); 1054 break; 1055 case 0x63: 1056 if (ABCIsConst) 1057 Res = Xor(B, Or(C, Not(A))); 1058 break; 1059 case 0x64: 1060 if (ABCIsConst) 1061 Res = Nor(Nor(A, B), Xnor(B, C)); 1062 break; 1063 case 0x65: 1064 if (ABCIsConst) 1065 Res = Xor(Or(B, Not(A)), C); 1066 break; 1067 case 0x66: 1068 Res = Xor(B, C); 1069 break; 1070 case 0x67: 1071 if (ABCIsConst) 1072 Res = Or(Nor(A, B), Xor(B, C)); 1073 break; 1074 case 0x68: 1075 if (ABCIsConst) 1076 Res = Xor(Xor(A, B), Nor(Nor(A, B), C)); 1077 break; 1078 case 0x69: 1079 if (ABCIsConst) 1080 Res = Xor(Xnor(A, B), C); 1081 break; 1082 case 0x6a: 1083 if (ABCIsConst) 1084 Res = Xor(And(A, B), C); 1085 break; 1086 case 0x6b: 1087 if (ABCIsConst) 1088 Res = Or(Nor(A, B), Xor(Xnor(A, B), C)); 1089 break; 1090 case 0x6c: 1091 if (ABCIsConst) 1092 Res = Xor(And(A, C), B); 1093 break; 1094 case 0x6d: 1095 if (ABCIsConst) 1096 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C); 1097 break; 1098 case 0x6e: 1099 if (ABCIsConst) 1100 Res = Or(Nor(A, Not(B)), Xor(B, C)); 1101 break; 1102 case 0x6f: 1103 if (ABCIsConst) 1104 Res = Nand(A, Xnor(B, C)); 1105 break; 1106 case 0x70: 1107 if (ABCIsConst) 1108 Res = And(A, Nand(B, C)); 1109 break; 1110 case 0x71: 1111 if (ABCIsConst) 1112 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A); 1113 break; 1114 case 0x72: 1115 if (ABCIsConst) 1116 Res = Xor(Or(Xor(A, B), C), B); 1117 break; 1118 case 0x73: 1119 if (ABCIsConst) 1120 Res = Nand(Nand(A, Not(C)), B); 1121 break; 1122 case 0x74: 1123 if (ABCIsConst) 1124 Res = Xor(Or(Xor(A, C), B), C); 1125 break; 1126 case 0x75: 1127 if (ABCIsConst) 1128 Res = Nand(Nand(A, Not(B)), C); 1129 break; 1130 case 0x76: 1131 if (ABCIsConst) 1132 Res = Xor(B, Or(Nor(B, Not(A)), C)); 1133 break; 1134 case 0x77: 1135 if (BCIsConst) 1136 Res = Nand(B, C); 1137 break; 1138 case 0x78: 1139 if (ABCIsConst) 1140 Res = Xor(A, And(B, C)); 1141 break; 1142 case 0x79: 1143 if (ABCIsConst) 1144 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C); 1145 break; 1146 case 0x7a: 1147 if (ABCIsConst) 1148 Res = Or(Xor(A, C), Nor(B, Not(A))); 1149 break; 1150 case 0x7b: 1151 if (ABCIsConst) 1152 Res = Nand(Xnor(A, C), B); 1153 break; 1154 case 0x7c: 1155 if (ABCIsConst) 1156 Res = Or(Xor(A, B), Nor(C, Not(A))); 1157 break; 1158 case 0x7d: 1159 if (ABCIsConst) 1160 Res = Nand(Xnor(A, B), C); 1161 break; 1162 case 0x7e: 1163 if (ABCIsConst) 1164 Res = Or(Xor(A, B), Xor(A, C)); 1165 break; 1166 case 0x7f: 1167 if (ABCIsConst) 1168 Res = Nand(And(A, B), C); 1169 break; 1170 case 0x80: 1171 if (ABCIsConst) 1172 Res = And(And(A, B), C); 1173 break; 1174 case 0x81: 1175 if (ABCIsConst) 1176 Res = Nor(Xor(A, B), Xor(A, C)); 1177 break; 1178 case 0x82: 1179 if (ABCIsConst) 1180 Res = And(Xnor(A, B), C); 1181 break; 1182 case 0x83: 1183 if (ABCIsConst) 1184 Res = Nor(Xor(A, B), Nor(C, Not(A))); 1185 break; 1186 case 0x84: 1187 if (ABCIsConst) 1188 Res = And(Xnor(A, C), B); 1189 break; 1190 case 0x85: 1191 
if (ABCIsConst) 1192 Res = Nor(Xor(A, C), Nor(B, Not(A))); 1193 break; 1194 case 0x86: 1195 if (ABCIsConst) 1196 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C); 1197 break; 1198 case 0x87: 1199 if (ABCIsConst) 1200 Res = Xor(A, Nand(B, C)); 1201 break; 1202 case 0x88: 1203 Res = And(B, C); 1204 break; 1205 case 0x89: 1206 if (ABCIsConst) 1207 Res = Xor(B, Nor(Nor(B, Not(A)), C)); 1208 break; 1209 case 0x8a: 1210 if (ABCIsConst) 1211 Res = And(Nand(A, Not(B)), C); 1212 break; 1213 case 0x8b: 1214 if (ABCIsConst) 1215 Res = Xor(Nor(Xor(A, C), B), C); 1216 break; 1217 case 0x8c: 1218 if (ABCIsConst) 1219 Res = And(Nand(A, Not(C)), B); 1220 break; 1221 case 0x8d: 1222 if (ABCIsConst) 1223 Res = Xor(Nor(Xor(A, B), C), B); 1224 break; 1225 case 0x8e: 1226 if (ABCIsConst) 1227 Res = Xor(Or(Xor(A, B), Xor(A, C)), A); 1228 break; 1229 case 0x8f: 1230 if (ABCIsConst) 1231 Res = Nand(A, Nand(B, C)); 1232 break; 1233 case 0x90: 1234 if (ABCIsConst) 1235 Res = And(A, Xnor(B, C)); 1236 break; 1237 case 0x91: 1238 if (ABCIsConst) 1239 Res = Nor(Nor(A, Not(B)), Xor(B, C)); 1240 break; 1241 case 0x92: 1242 if (ABCIsConst) 1243 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C); 1244 break; 1245 case 0x93: 1246 if (ABCIsConst) 1247 Res = Xor(Nand(A, C), B); 1248 break; 1249 case 0x94: 1250 if (ABCIsConst) 1251 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C)); 1252 break; 1253 case 0x95: 1254 if (ABCIsConst) 1255 Res = Xor(Nand(A, B), C); 1256 break; 1257 case 0x96: 1258 if (ABCIsConst) 1259 Res = Xor(Xor(A, B), C); 1260 break; 1261 case 0x97: 1262 if (ABCIsConst) 1263 Res = Xor(Xor(A, B), Or(Nor(A, B), C)); 1264 break; 1265 case 0x98: 1266 if (ABCIsConst) 1267 Res = Nor(Nor(A, B), Xor(B, C)); 1268 break; 1269 case 0x99: 1270 if (BCIsConst) 1271 Res = Xnor(B, C); 1272 break; 1273 case 0x9a: 1274 if (ABCIsConst) 1275 Res = Xor(Nor(B, Not(A)), C); 1276 break; 1277 case 0x9b: 1278 if (ABCIsConst) 1279 Res = Or(Nor(A, B), Xnor(B, C)); 1280 break; 1281 case 0x9c: 1282 if (ABCIsConst) 1283 Res = Xor(B, Nor(C, Not(A))); 1284 break; 1285 case 0x9d: 1286 if (ABCIsConst) 1287 Res = Or(Nor(A, C), Xnor(B, C)); 1288 break; 1289 case 0x9e: 1290 if (ABCIsConst) 1291 Res = Xor(And(Xor(A, B), Nand(B, C)), C); 1292 break; 1293 case 0x9f: 1294 if (ABCIsConst) 1295 Res = Nand(A, Xor(B, C)); 1296 break; 1297 case 0xa0: 1298 Res = And(A, C); 1299 break; 1300 case 0xa1: 1301 if (ABCIsConst) 1302 Res = Xor(A, Nor(Nor(A, Not(B)), C)); 1303 break; 1304 case 0xa2: 1305 if (ABCIsConst) 1306 Res = And(Or(A, Not(B)), C); 1307 break; 1308 case 0xa3: 1309 if (ABCIsConst) 1310 Res = Xor(Nor(Xor(B, C), A), C); 1311 break; 1312 case 0xa4: 1313 if (ABCIsConst) 1314 Res = Xor(A, Nor(Nor(A, B), C)); 1315 break; 1316 case 0xa5: 1317 if (ACIsConst) 1318 Res = Xnor(A, C); 1319 break; 1320 case 0xa6: 1321 if (ABCIsConst) 1322 Res = Xor(Nor(A, Not(B)), C); 1323 break; 1324 case 0xa7: 1325 if (ABCIsConst) 1326 Res = Or(Nor(A, B), Xnor(A, C)); 1327 break; 1328 case 0xa8: 1329 if (ABCIsConst) 1330 Res = And(Or(A, B), C); 1331 break; 1332 case 0xa9: 1333 if (ABCIsConst) 1334 Res = Xor(Nor(A, B), C); 1335 break; 1336 case 0xaa: 1337 Res = C; 1338 break; 1339 case 0xab: 1340 if (ABCIsConst) 1341 Res = Or(Nor(A, B), C); 1342 break; 1343 case 0xac: 1344 if (ABCIsConst) 1345 Res = Xor(Nor(Xnor(B, C), A), C); 1346 break; 1347 case 0xad: 1348 if (ABCIsConst) 1349 Res = Or(Xnor(A, C), And(B, C)); 1350 break; 1351 case 0xae: 1352 if (ABCIsConst) 1353 Res = Or(Nor(A, Not(B)), C); 1354 break; 1355 case 0xaf: 1356 if (ACIsConst) 1357 Res = Or(C, Not(A)); 1358 break; 1359 case 0xb0: 1360 if 
(ABCIsConst) 1361 Res = And(A, Nand(B, Not(C))); 1362 break; 1363 case 0xb1: 1364 if (ABCIsConst) 1365 Res = Xor(A, Nor(Xor(A, B), C)); 1366 break; 1367 case 0xb2: 1368 if (ABCIsConst) 1369 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A); 1370 break; 1371 case 0xb3: 1372 if (ABCIsConst) 1373 Res = Nand(Nand(A, C), B); 1374 break; 1375 case 0xb4: 1376 if (ABCIsConst) 1377 Res = Xor(A, Nor(C, Not(B))); 1378 break; 1379 case 0xb5: 1380 if (ABCIsConst) 1381 Res = Or(Xnor(A, C), Nor(B, C)); 1382 break; 1383 case 0xb6: 1384 if (ABCIsConst) 1385 Res = Xor(And(Xor(A, B), Nand(A, C)), C); 1386 break; 1387 case 0xb7: 1388 if (ABCIsConst) 1389 Res = Nand(Xor(A, C), B); 1390 break; 1391 case 0xb8: 1392 if (ABCIsConst) 1393 Res = Xor(Nor(Xnor(A, C), B), C); 1394 break; 1395 case 0xb9: 1396 if (ABCIsConst) 1397 Res = Xor(Nor(And(A, C), B), C); 1398 break; 1399 case 0xba: 1400 if (ABCIsConst) 1401 Res = Or(Nor(B, Not(A)), C); 1402 break; 1403 case 0xbb: 1404 if (BCIsConst) 1405 Res = Or(C, Not(B)); 1406 break; 1407 case 0xbc: 1408 if (ABCIsConst) 1409 Res = Xor(A, And(Nand(A, C), B)); 1410 break; 1411 case 0xbd: 1412 if (ABCIsConst) 1413 Res = Or(Xor(A, B), Xnor(A, C)); 1414 break; 1415 case 0xbe: 1416 if (ABCIsConst) 1417 Res = Or(Xor(A, B), C); 1418 break; 1419 case 0xbf: 1420 if (ABCIsConst) 1421 Res = Or(Nand(A, B), C); 1422 break; 1423 case 0xc0: 1424 Res = And(A, B); 1425 break; 1426 case 0xc1: 1427 if (ABCIsConst) 1428 Res = Xor(A, Nor(Nor(A, Not(C)), B)); 1429 break; 1430 case 0xc2: 1431 if (ABCIsConst) 1432 Res = Xor(A, Nor(Nor(A, C), B)); 1433 break; 1434 case 0xc3: 1435 if (ABIsConst) 1436 Res = Xnor(A, B); 1437 break; 1438 case 0xc4: 1439 if (ABCIsConst) 1440 Res = And(Or(A, Not(C)), B); 1441 break; 1442 case 0xc5: 1443 if (ABCIsConst) 1444 Res = Xor(B, Nor(A, Xor(B, C))); 1445 break; 1446 case 0xc6: 1447 if (ABCIsConst) 1448 Res = Xor(Nor(A, Not(C)), B); 1449 break; 1450 case 0xc7: 1451 if (ABCIsConst) 1452 Res = Or(Xnor(A, B), Nor(A, C)); 1453 break; 1454 case 0xc8: 1455 if (ABCIsConst) 1456 Res = And(Or(A, C), B); 1457 break; 1458 case 0xc9: 1459 if (ABCIsConst) 1460 Res = Xor(Nor(A, C), B); 1461 break; 1462 case 0xca: 1463 if (ABCIsConst) 1464 Res = Xor(B, Nor(A, Xnor(B, C))); 1465 break; 1466 case 0xcb: 1467 if (ABCIsConst) 1468 Res = Or(Xnor(A, B), And(B, C)); 1469 break; 1470 case 0xcc: 1471 Res = B; 1472 break; 1473 case 0xcd: 1474 if (ABCIsConst) 1475 Res = Or(Nor(A, C), B); 1476 break; 1477 case 0xce: 1478 if (ABCIsConst) 1479 Res = Or(Nor(A, Not(C)), B); 1480 break; 1481 case 0xcf: 1482 if (ABIsConst) 1483 Res = Or(B, Not(A)); 1484 break; 1485 case 0xd0: 1486 if (ABCIsConst) 1487 Res = And(A, Or(B, Not(C))); 1488 break; 1489 case 0xd1: 1490 if (ABCIsConst) 1491 Res = Xor(A, Nor(Xor(A, C), B)); 1492 break; 1493 case 0xd2: 1494 if (ABCIsConst) 1495 Res = Xor(A, Nor(B, Not(C))); 1496 break; 1497 case 0xd3: 1498 if (ABCIsConst) 1499 Res = Or(Xnor(A, B), Nor(B, C)); 1500 break; 1501 case 0xd4: 1502 if (ABCIsConst) 1503 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A); 1504 break; 1505 case 0xd5: 1506 if (ABCIsConst) 1507 Res = Nand(Nand(A, B), C); 1508 break; 1509 case 0xd6: 1510 if (ABCIsConst) 1511 Res = Xor(Xor(A, B), Or(And(A, B), C)); 1512 break; 1513 case 0xd7: 1514 if (ABCIsConst) 1515 Res = Nand(Xor(A, B), C); 1516 break; 1517 case 0xd8: 1518 if (ABCIsConst) 1519 Res = Xor(Nor(Xnor(A, B), C), B); 1520 break; 1521 case 0xd9: 1522 if (ABCIsConst) 1523 Res = Or(And(A, B), Xnor(B, C)); 1524 break; 1525 case 0xda: 1526 if (ABCIsConst) 1527 Res = Xor(A, And(Nand(A, B), C)); 1528 break; 1529 case 
0xdb: 1530 if (ABCIsConst) 1531 Res = Or(Xnor(A, B), Xor(A, C)); 1532 break; 1533 case 0xdc: 1534 if (ABCIsConst) 1535 Res = Or(B, Nor(C, Not(A))); 1536 break; 1537 case 0xdd: 1538 if (BCIsConst) 1539 Res = Or(B, Not(C)); 1540 break; 1541 case 0xde: 1542 if (ABCIsConst) 1543 Res = Or(Xor(A, C), B); 1544 break; 1545 case 0xdf: 1546 if (ABCIsConst) 1547 Res = Or(Nand(A, C), B); 1548 break; 1549 case 0xe0: 1550 if (ABCIsConst) 1551 Res = And(A, Or(B, C)); 1552 break; 1553 case 0xe1: 1554 if (ABCIsConst) 1555 Res = Xor(A, Nor(B, C)); 1556 break; 1557 case 0xe2: 1558 if (ABCIsConst) 1559 Res = Xor(A, Nor(Xnor(A, C), B)); 1560 break; 1561 case 0xe3: 1562 if (ABCIsConst) 1563 Res = Xor(A, Nor(And(A, C), B)); 1564 break; 1565 case 0xe4: 1566 if (ABCIsConst) 1567 Res = Xor(A, Nor(Xnor(A, B), C)); 1568 break; 1569 case 0xe5: 1570 if (ABCIsConst) 1571 Res = Xor(A, Nor(And(A, B), C)); 1572 break; 1573 case 0xe6: 1574 if (ABCIsConst) 1575 Res = Or(And(A, B), Xor(B, C)); 1576 break; 1577 case 0xe7: 1578 if (ABCIsConst) 1579 Res = Or(Xnor(A, B), Xnor(A, C)); 1580 break; 1581 case 0xe8: 1582 if (ABCIsConst) 1583 Res = Xor(Or(A, B), Nor(Xnor(A, B), C)); 1584 break; 1585 case 0xe9: 1586 if (ABCIsConst) 1587 Res = Xor(Xor(A, B), Nand(Nand(A, B), C)); 1588 break; 1589 case 0xea: 1590 if (ABCIsConst) 1591 Res = Or(And(A, B), C); 1592 break; 1593 case 0xeb: 1594 if (ABCIsConst) 1595 Res = Or(Xnor(A, B), C); 1596 break; 1597 case 0xec: 1598 if (ABCIsConst) 1599 Res = Or(And(A, C), B); 1600 break; 1601 case 0xed: 1602 if (ABCIsConst) 1603 Res = Or(Xnor(A, C), B); 1604 break; 1605 case 0xee: 1606 Res = Or(B, C); 1607 break; 1608 case 0xef: 1609 if (ABCIsConst) 1610 Res = Nand(A, Nor(B, C)); 1611 break; 1612 case 0xf0: 1613 Res = A; 1614 break; 1615 case 0xf1: 1616 if (ABCIsConst) 1617 Res = Or(A, Nor(B, C)); 1618 break; 1619 case 0xf2: 1620 if (ABCIsConst) 1621 Res = Or(A, Nor(B, Not(C))); 1622 break; 1623 case 0xf3: 1624 if (ABIsConst) 1625 Res = Or(A, Not(B)); 1626 break; 1627 case 0xf4: 1628 if (ABCIsConst) 1629 Res = Or(A, Nor(C, Not(B))); 1630 break; 1631 case 0xf5: 1632 if (ACIsConst) 1633 Res = Or(A, Not(C)); 1634 break; 1635 case 0xf6: 1636 if (ABCIsConst) 1637 Res = Or(A, Xor(B, C)); 1638 break; 1639 case 0xf7: 1640 if (ABCIsConst) 1641 Res = Or(A, Nand(B, C)); 1642 break; 1643 case 0xf8: 1644 if (ABCIsConst) 1645 Res = Or(A, And(B, C)); 1646 break; 1647 case 0xf9: 1648 if (ABCIsConst) 1649 Res = Or(A, Xnor(B, C)); 1650 break; 1651 case 0xfa: 1652 Res = Or(A, C); 1653 break; 1654 case 0xfb: 1655 if (ABCIsConst) 1656 Res = Nand(Nor(A, C), B); 1657 break; 1658 case 0xfc: 1659 Res = Or(A, B); 1660 break; 1661 case 0xfd: 1662 if (ABCIsConst) 1663 Res = Nand(Nor(A, B), C); 1664 break; 1665 case 0xfe: 1666 if (ABCIsConst) 1667 Res = Or(Or(A, B), C); 1668 break; 1669 case 0xff: 1670 Res = {Constant::getAllOnesValue(Ty), 0xff}; 1671 break; 1672 } 1673 1674 assert((Res.first == nullptr || Res.second == Imm) && 1675 "Simplification of ternary logic does not verify!"); 1676 return Res.first; 1677 } 1678 1679 static Value *simplifyX86insertps(const IntrinsicInst &II, 1680 InstCombiner::BuilderTy &Builder) { 1681 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1682 if (!CInt) 1683 return nullptr; 1684 1685 auto *VecTy = cast<FixedVectorType>(II.getType()); 1686 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); 1687 1688 // The immediate permute control byte looks like this: 1689 // [3:0] - zero mask for each 32-bit lane 1690 // [5:4] - select one 32-bit destination lane 1691 // [7:6] 
- select one 32-bit source lane 1692 1693 uint8_t Imm = CInt->getZExtValue(); 1694 uint8_t ZMask = Imm & 0xf; 1695 uint8_t DestLane = (Imm >> 4) & 0x3; 1696 uint8_t SourceLane = (Imm >> 6) & 0x3; 1697 1698 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); 1699 1700 // If all zero mask bits are set, this was just a weird way to 1701 // generate a zero vector. 1702 if (ZMask == 0xf) 1703 return ZeroVector; 1704 1705 // Initialize by passing all of the first source bits through. 1706 int ShuffleMask[4] = {0, 1, 2, 3}; 1707 1708 // We may replace the second operand with the zero vector. 1709 Value *V1 = II.getArgOperand(1); 1710 1711 if (ZMask) { 1712 // If the zero mask is being used with a single input or the zero mask 1713 // overrides the destination lane, this is a shuffle with the zero vector. 1714 if ((II.getArgOperand(0) == II.getArgOperand(1)) || 1715 (ZMask & (1 << DestLane))) { 1716 V1 = ZeroVector; 1717 // We may still move 32-bits of the first source vector from one lane 1718 // to another. 1719 ShuffleMask[DestLane] = SourceLane; 1720 // The zero mask may override the previous insert operation. 1721 for (unsigned i = 0; i < 4; ++i) 1722 if ((ZMask >> i) & 0x1) 1723 ShuffleMask[i] = i + 4; 1724 } else { 1725 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? 1726 return nullptr; 1727 } 1728 } else { 1729 // Replace the selected destination lane with the selected source lane. 1730 ShuffleMask[DestLane] = SourceLane + 4; 1731 } 1732 1733 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); 1734 } 1735 1736 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding 1737 /// or conversion to a shuffle vector. 1738 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, 1739 ConstantInt *CILength, ConstantInt *CIIndex, 1740 InstCombiner::BuilderTy &Builder) { 1741 auto LowConstantHighUndef = [&](uint64_t Val) { 1742 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 1743 Constant *Args[] = {ConstantInt::get(IntTy64, Val), 1744 UndefValue::get(IntTy64)}; 1745 return ConstantVector::get(Args); 1746 }; 1747 1748 // See if we're dealing with constant values. 1749 auto *C0 = dyn_cast<Constant>(Op0); 1750 auto *CI0 = 1751 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 1752 : nullptr; 1753 1754 // Attempt to constant fold. 1755 if (CILength && CIIndex) { 1756 // From AMD documentation: "The bit index and field length are each six 1757 // bits in length other bits of the field are ignored." 1758 APInt APIndex = CIIndex->getValue().zextOrTrunc(6); 1759 APInt APLength = CILength->getValue().zextOrTrunc(6); 1760 1761 unsigned Index = APIndex.getZExtValue(); 1762 1763 // From AMD documentation: "a value of zero in the field length is 1764 // defined as length of 64". 1765 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 1766 1767 // From AMD documentation: "If the sum of the bit index + length field 1768 // is greater than 64, the results are undefined". 1769 unsigned End = Index + Length; 1770 1771 // Note that both field index and field length are 8-bit quantities. 1772 // Since variables 'Index' and 'Length' are unsigned values 1773 // obtained from zero-extending field index and field length 1774 // respectively, their sum should never wrap around. 1775 if (End > 64) 1776 return UndefValue::get(II.getType()); 1777 1778 // If we are inserting whole bytes, we can convert this to a shuffle. 1779 // Lowering can recognize EXTRQI shuffle masks. 
1780 if ((Length % 8) == 0 && (Index % 8) == 0) { 1781 // Convert bit indices to byte indices. 1782 Length /= 8; 1783 Index /= 8; 1784 1785 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1786 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 1787 1788 SmallVector<int, 16> ShuffleMask; 1789 for (int i = 0; i != (int)Length; ++i) 1790 ShuffleMask.push_back(i + Index); 1791 for (int i = Length; i != 8; ++i) 1792 ShuffleMask.push_back(i + 16); 1793 for (int i = 8; i != 16; ++i) 1794 ShuffleMask.push_back(-1); 1795 1796 Value *SV = Builder.CreateShuffleVector( 1797 Builder.CreateBitCast(Op0, ShufTy), 1798 ConstantAggregateZero::get(ShufTy), ShuffleMask); 1799 return Builder.CreateBitCast(SV, II.getType()); 1800 } 1801 1802 // Constant Fold - shift Index'th bit to lowest position and mask off 1803 // Length bits. 1804 if (CI0) { 1805 APInt Elt = CI0->getValue(); 1806 Elt.lshrInPlace(Index); 1807 Elt = Elt.zextOrTrunc(Length); 1808 return LowConstantHighUndef(Elt.getZExtValue()); 1809 } 1810 1811 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. 1812 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { 1813 Value *Args[] = {Op0, CILength, CIIndex}; 1814 Module *M = II.getModule(); 1815 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); 1816 return Builder.CreateCall(F, Args); 1817 } 1818 } 1819 1820 // Constant Fold - extraction from zero is always {zero, undef}. 1821 if (CI0 && CI0->isZero()) 1822 return LowConstantHighUndef(0); 1823 1824 return nullptr; 1825 } 1826 1827 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant 1828 /// folding or conversion to a shuffle vector. 1829 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, 1830 APInt APLength, APInt APIndex, 1831 InstCombiner::BuilderTy &Builder) { 1832 // From AMD documentation: "The bit index and field length are each six bits 1833 // in length other bits of the field are ignored." 1834 APIndex = APIndex.zextOrTrunc(6); 1835 APLength = APLength.zextOrTrunc(6); 1836 1837 // Attempt to constant fold. 1838 unsigned Index = APIndex.getZExtValue(); 1839 1840 // From AMD documentation: "a value of zero in the field length is 1841 // defined as length of 64". 1842 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 1843 1844 // From AMD documentation: "If the sum of the bit index + length field 1845 // is greater than 64, the results are undefined". 1846 unsigned End = Index + Length; 1847 1848 // Note that both field index and field length are 8-bit quantities. 1849 // Since variables 'Index' and 'Length' are unsigned values 1850 // obtained from zero-extending field index and field length 1851 // respectively, their sum should never wrap around. 1852 if (End > 64) 1853 return UndefValue::get(II.getType()); 1854 1855 // If we are inserting whole bytes, we can convert this to a shuffle. 1856 // Lowering can recognize INSERTQI shuffle masks. 1857 if ((Length % 8) == 0 && (Index % 8) == 0) { 1858 // Convert bit indices to byte indices. 
1859 Length /= 8; 1860 Index /= 8; 1861 1862 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1863 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 1864 1865 SmallVector<int, 16> ShuffleMask; 1866 for (int i = 0; i != (int)Index; ++i) 1867 ShuffleMask.push_back(i); 1868 for (int i = 0; i != (int)Length; ++i) 1869 ShuffleMask.push_back(i + 16); 1870 for (int i = Index + Length; i != 8; ++i) 1871 ShuffleMask.push_back(i); 1872 for (int i = 8; i != 16; ++i) 1873 ShuffleMask.push_back(-1); 1874 1875 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), 1876 Builder.CreateBitCast(Op1, ShufTy), 1877 ShuffleMask); 1878 return Builder.CreateBitCast(SV, II.getType()); 1879 } 1880 1881 // See if we're dealing with constant values. 1882 auto *C0 = dyn_cast<Constant>(Op0); 1883 auto *C1 = dyn_cast<Constant>(Op1); 1884 auto *CI00 = 1885 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 1886 : nullptr; 1887 auto *CI10 = 1888 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 1889 : nullptr; 1890 1891 // Constant Fold - insert bottom Length bits starting at the Index'th bit. 1892 if (CI00 && CI10) { 1893 APInt V00 = CI00->getValue(); 1894 APInt V10 = CI10->getValue(); 1895 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); 1896 V00 = V00 & ~Mask; 1897 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); 1898 APInt Val = V00 | V10; 1899 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 1900 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), 1901 UndefValue::get(IntTy64)}; 1902 return ConstantVector::get(Args); 1903 } 1904 1905 // If we were an INSERTQ call, we'll save demanded elements if we convert to 1906 // INSERTQI. 1907 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { 1908 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1909 Constant *CILength = ConstantInt::get(IntTy8, Length, false); 1910 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); 1911 1912 Value *Args[] = {Op0, Op1, CILength, CIIndex}; 1913 Module *M = II.getModule(); 1914 Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); 1915 return Builder.CreateCall(F, Args); 1916 } 1917 1918 return nullptr; 1919 } 1920 1921 /// Attempt to convert pshufb* to shufflevector if the mask is constant. 1922 static Value *simplifyX86pshufb(const IntrinsicInst &II, 1923 InstCombiner::BuilderTy &Builder) { 1924 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 1925 if (!V) 1926 return nullptr; 1927 1928 auto *VecTy = cast<FixedVectorType>(II.getType()); 1929 unsigned NumElts = VecTy->getNumElements(); 1930 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && 1931 "Unexpected number of elements in shuffle mask!"); 1932 1933 // Construct a shuffle mask from constant integers or UNDEFs. 1934 int Indexes[64]; 1935 1936 // Each byte in the shuffle control mask forms an index to permute the 1937 // corresponding byte in the destination operand. 1938 for (unsigned I = 0; I < NumElts; ++I) { 1939 Constant *COp = V->getAggregateElement(I); 1940 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1941 return nullptr; 1942 1943 if (isa<UndefValue>(COp)) { 1944 Indexes[I] = -1; 1945 continue; 1946 } 1947 1948 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); 1949 1950 // If the most significant bit (bit[7]) of each byte of the shuffle 1951 // control mask is set, then zero is written in the result byte. 1952 // The zero vector is in the right-hand side of the resulting 1953 // shufflevector. 

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
}

std::optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
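      // (For a constant Index < BitWidth, bzhi(X, Index) is just
      //  X & ((1 << Index) - 1), i.e. an AND with maskTrailingOnes(Index).)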
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
        return IC.replaceInstUsesWith(II, Shifted);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        // Walk the mask from LSB to MSB, packing each source bit selected by
        // the mask into the next low result bit (PEXT semantics).
        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        // Walk the mask from LSB to MSB, depositing the next low bit of the
        // source at each set mask position (PDEP semantics).
        while (Mask) {
          // Isolate lowest set bit.
2182 uint64_t BitToSet = Mask & -Mask; 2183 if (BitToTest & Src) 2184 Result |= BitToSet; 2185 2186 BitToTest <<= 1; 2187 // Clear lowest set bit; 2188 Mask &= Mask - 1; 2189 } 2190 2191 return IC.replaceInstUsesWith(II, 2192 ConstantInt::get(II.getType(), Result)); 2193 } 2194 } 2195 break; 2196 2197 case Intrinsic::x86_sse_cvtss2si: 2198 case Intrinsic::x86_sse_cvtss2si64: 2199 case Intrinsic::x86_sse_cvttss2si: 2200 case Intrinsic::x86_sse_cvttss2si64: 2201 case Intrinsic::x86_sse2_cvtsd2si: 2202 case Intrinsic::x86_sse2_cvtsd2si64: 2203 case Intrinsic::x86_sse2_cvttsd2si: 2204 case Intrinsic::x86_sse2_cvttsd2si64: 2205 case Intrinsic::x86_avx512_vcvtss2si32: 2206 case Intrinsic::x86_avx512_vcvtss2si64: 2207 case Intrinsic::x86_avx512_vcvtss2usi32: 2208 case Intrinsic::x86_avx512_vcvtss2usi64: 2209 case Intrinsic::x86_avx512_vcvtsd2si32: 2210 case Intrinsic::x86_avx512_vcvtsd2si64: 2211 case Intrinsic::x86_avx512_vcvtsd2usi32: 2212 case Intrinsic::x86_avx512_vcvtsd2usi64: 2213 case Intrinsic::x86_avx512_cvttss2si: 2214 case Intrinsic::x86_avx512_cvttss2si64: 2215 case Intrinsic::x86_avx512_cvttss2usi: 2216 case Intrinsic::x86_avx512_cvttss2usi64: 2217 case Intrinsic::x86_avx512_cvttsd2si: 2218 case Intrinsic::x86_avx512_cvttsd2si64: 2219 case Intrinsic::x86_avx512_cvttsd2usi: 2220 case Intrinsic::x86_avx512_cvttsd2usi64: { 2221 // These intrinsics only demand the 0th element of their input vectors. If 2222 // we can simplify the input based on that, do so now. 2223 Value *Arg = II.getArgOperand(0); 2224 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); 2225 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2226 return IC.replaceOperand(II, 0, V); 2227 } 2228 break; 2229 } 2230 2231 case Intrinsic::x86_mmx_pmovmskb: 2232 case Intrinsic::x86_sse_movmsk_ps: 2233 case Intrinsic::x86_sse2_movmsk_pd: 2234 case Intrinsic::x86_sse2_pmovmskb_128: 2235 case Intrinsic::x86_avx_movmsk_pd_256: 2236 case Intrinsic::x86_avx_movmsk_ps_256: 2237 case Intrinsic::x86_avx2_pmovmskb: 2238 if (Value *V = simplifyX86movmsk(II, IC.Builder)) { 2239 return IC.replaceInstUsesWith(II, V); 2240 } 2241 break; 2242 2243 case Intrinsic::x86_sse_comieq_ss: 2244 case Intrinsic::x86_sse_comige_ss: 2245 case Intrinsic::x86_sse_comigt_ss: 2246 case Intrinsic::x86_sse_comile_ss: 2247 case Intrinsic::x86_sse_comilt_ss: 2248 case Intrinsic::x86_sse_comineq_ss: 2249 case Intrinsic::x86_sse_ucomieq_ss: 2250 case Intrinsic::x86_sse_ucomige_ss: 2251 case Intrinsic::x86_sse_ucomigt_ss: 2252 case Intrinsic::x86_sse_ucomile_ss: 2253 case Intrinsic::x86_sse_ucomilt_ss: 2254 case Intrinsic::x86_sse_ucomineq_ss: 2255 case Intrinsic::x86_sse2_comieq_sd: 2256 case Intrinsic::x86_sse2_comige_sd: 2257 case Intrinsic::x86_sse2_comigt_sd: 2258 case Intrinsic::x86_sse2_comile_sd: 2259 case Intrinsic::x86_sse2_comilt_sd: 2260 case Intrinsic::x86_sse2_comineq_sd: 2261 case Intrinsic::x86_sse2_ucomieq_sd: 2262 case Intrinsic::x86_sse2_ucomige_sd: 2263 case Intrinsic::x86_sse2_ucomigt_sd: 2264 case Intrinsic::x86_sse2_ucomile_sd: 2265 case Intrinsic::x86_sse2_ucomilt_sd: 2266 case Intrinsic::x86_sse2_ucomineq_sd: 2267 case Intrinsic::x86_avx512_vcomi_ss: 2268 case Intrinsic::x86_avx512_vcomi_sd: 2269 case Intrinsic::x86_avx512_mask_cmp_ss: 2270 case Intrinsic::x86_avx512_mask_cmp_sd: { 2271 // These intrinsics only demand the 0th element of their input vectors. If 2272 // we can simplify the input based on that, do so now. 
2273 bool MadeChange = false; 2274 Value *Arg0 = II.getArgOperand(0); 2275 Value *Arg1 = II.getArgOperand(1); 2276 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2277 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2278 IC.replaceOperand(II, 0, V); 2279 MadeChange = true; 2280 } 2281 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2282 IC.replaceOperand(II, 1, V); 2283 MadeChange = true; 2284 } 2285 if (MadeChange) { 2286 return &II; 2287 } 2288 break; 2289 } 2290 2291 case Intrinsic::x86_avx512_add_ps_512: 2292 case Intrinsic::x86_avx512_div_ps_512: 2293 case Intrinsic::x86_avx512_mul_ps_512: 2294 case Intrinsic::x86_avx512_sub_ps_512: 2295 case Intrinsic::x86_avx512_add_pd_512: 2296 case Intrinsic::x86_avx512_div_pd_512: 2297 case Intrinsic::x86_avx512_mul_pd_512: 2298 case Intrinsic::x86_avx512_sub_pd_512: 2299 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2300 // IR operations. 2301 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2302 if (R->getValue() == 4) { 2303 Value *Arg0 = II.getArgOperand(0); 2304 Value *Arg1 = II.getArgOperand(1); 2305 2306 Value *V; 2307 switch (IID) { 2308 default: 2309 llvm_unreachable("Case stmts out of sync!"); 2310 case Intrinsic::x86_avx512_add_ps_512: 2311 case Intrinsic::x86_avx512_add_pd_512: 2312 V = IC.Builder.CreateFAdd(Arg0, Arg1); 2313 break; 2314 case Intrinsic::x86_avx512_sub_ps_512: 2315 case Intrinsic::x86_avx512_sub_pd_512: 2316 V = IC.Builder.CreateFSub(Arg0, Arg1); 2317 break; 2318 case Intrinsic::x86_avx512_mul_ps_512: 2319 case Intrinsic::x86_avx512_mul_pd_512: 2320 V = IC.Builder.CreateFMul(Arg0, Arg1); 2321 break; 2322 case Intrinsic::x86_avx512_div_ps_512: 2323 case Intrinsic::x86_avx512_div_pd_512: 2324 V = IC.Builder.CreateFDiv(Arg0, Arg1); 2325 break; 2326 } 2327 2328 return IC.replaceInstUsesWith(II, V); 2329 } 2330 } 2331 break; 2332 2333 case Intrinsic::x86_avx512_mask_add_ss_round: 2334 case Intrinsic::x86_avx512_mask_div_ss_round: 2335 case Intrinsic::x86_avx512_mask_mul_ss_round: 2336 case Intrinsic::x86_avx512_mask_sub_ss_round: 2337 case Intrinsic::x86_avx512_mask_add_sd_round: 2338 case Intrinsic::x86_avx512_mask_div_sd_round: 2339 case Intrinsic::x86_avx512_mask_mul_sd_round: 2340 case Intrinsic::x86_avx512_mask_sub_sd_round: 2341 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2342 // IR operations. 2343 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { 2344 if (R->getValue() == 4) { 2345 // Extract the element as scalars. 
2346 Value *Arg0 = II.getArgOperand(0); 2347 Value *Arg1 = II.getArgOperand(1); 2348 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); 2349 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); 2350 2351 Value *V; 2352 switch (IID) { 2353 default: 2354 llvm_unreachable("Case stmts out of sync!"); 2355 case Intrinsic::x86_avx512_mask_add_ss_round: 2356 case Intrinsic::x86_avx512_mask_add_sd_round: 2357 V = IC.Builder.CreateFAdd(LHS, RHS); 2358 break; 2359 case Intrinsic::x86_avx512_mask_sub_ss_round: 2360 case Intrinsic::x86_avx512_mask_sub_sd_round: 2361 V = IC.Builder.CreateFSub(LHS, RHS); 2362 break; 2363 case Intrinsic::x86_avx512_mask_mul_ss_round: 2364 case Intrinsic::x86_avx512_mask_mul_sd_round: 2365 V = IC.Builder.CreateFMul(LHS, RHS); 2366 break; 2367 case Intrinsic::x86_avx512_mask_div_ss_round: 2368 case Intrinsic::x86_avx512_mask_div_sd_round: 2369 V = IC.Builder.CreateFDiv(LHS, RHS); 2370 break; 2371 } 2372 2373 // Handle the masking aspect of the intrinsic. 2374 Value *Mask = II.getArgOperand(3); 2375 auto *C = dyn_cast<ConstantInt>(Mask); 2376 // We don't need a select if we know the mask bit is a 1. 2377 if (!C || !C->getValue()[0]) { 2378 // Cast the mask to an i1 vector and then extract the lowest element. 2379 auto *MaskTy = FixedVectorType::get( 2380 IC.Builder.getInt1Ty(), 2381 cast<IntegerType>(Mask->getType())->getBitWidth()); 2382 Mask = IC.Builder.CreateBitCast(Mask, MaskTy); 2383 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); 2384 // Extract the lowest element from the passthru operand. 2385 Value *Passthru = 2386 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); 2387 V = IC.Builder.CreateSelect(Mask, V, Passthru); 2388 } 2389 2390 // Insert the result back into the original argument 0. 2391 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2392 2393 return IC.replaceInstUsesWith(II, V); 2394 } 2395 } 2396 break; 2397 2398 // Constant fold ashr( <A x Bi>, Ci ). 2399 // Constant fold lshr( <A x Bi>, Ci ). 2400 // Constant fold shl( <A x Bi>, Ci ). 
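  // e.g. an in-range immediate count like @llvm.x86.sse2.psrli.d(%v, 3) can
  // be rewritten by simplifyX86immShift as a generic vector shift:
  //   lshr <4 x i32> %v, <i32 3, i32 3, i32 3, i32 3>
  // (out-of-range counts are handled separately).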
2401 case Intrinsic::x86_sse2_psrai_d: 2402 case Intrinsic::x86_sse2_psrai_w: 2403 case Intrinsic::x86_avx2_psrai_d: 2404 case Intrinsic::x86_avx2_psrai_w: 2405 case Intrinsic::x86_avx512_psrai_q_128: 2406 case Intrinsic::x86_avx512_psrai_q_256: 2407 case Intrinsic::x86_avx512_psrai_d_512: 2408 case Intrinsic::x86_avx512_psrai_q_512: 2409 case Intrinsic::x86_avx512_psrai_w_512: 2410 case Intrinsic::x86_sse2_psrli_d: 2411 case Intrinsic::x86_sse2_psrli_q: 2412 case Intrinsic::x86_sse2_psrli_w: 2413 case Intrinsic::x86_avx2_psrli_d: 2414 case Intrinsic::x86_avx2_psrli_q: 2415 case Intrinsic::x86_avx2_psrli_w: 2416 case Intrinsic::x86_avx512_psrli_d_512: 2417 case Intrinsic::x86_avx512_psrli_q_512: 2418 case Intrinsic::x86_avx512_psrli_w_512: 2419 case Intrinsic::x86_sse2_pslli_d: 2420 case Intrinsic::x86_sse2_pslli_q: 2421 case Intrinsic::x86_sse2_pslli_w: 2422 case Intrinsic::x86_avx2_pslli_d: 2423 case Intrinsic::x86_avx2_pslli_q: 2424 case Intrinsic::x86_avx2_pslli_w: 2425 case Intrinsic::x86_avx512_pslli_d_512: 2426 case Intrinsic::x86_avx512_pslli_q_512: 2427 case Intrinsic::x86_avx512_pslli_w_512: 2428 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2429 return IC.replaceInstUsesWith(II, V); 2430 } 2431 break; 2432 2433 case Intrinsic::x86_sse2_psra_d: 2434 case Intrinsic::x86_sse2_psra_w: 2435 case Intrinsic::x86_avx2_psra_d: 2436 case Intrinsic::x86_avx2_psra_w: 2437 case Intrinsic::x86_avx512_psra_q_128: 2438 case Intrinsic::x86_avx512_psra_q_256: 2439 case Intrinsic::x86_avx512_psra_d_512: 2440 case Intrinsic::x86_avx512_psra_q_512: 2441 case Intrinsic::x86_avx512_psra_w_512: 2442 case Intrinsic::x86_sse2_psrl_d: 2443 case Intrinsic::x86_sse2_psrl_q: 2444 case Intrinsic::x86_sse2_psrl_w: 2445 case Intrinsic::x86_avx2_psrl_d: 2446 case Intrinsic::x86_avx2_psrl_q: 2447 case Intrinsic::x86_avx2_psrl_w: 2448 case Intrinsic::x86_avx512_psrl_d_512: 2449 case Intrinsic::x86_avx512_psrl_q_512: 2450 case Intrinsic::x86_avx512_psrl_w_512: 2451 case Intrinsic::x86_sse2_psll_d: 2452 case Intrinsic::x86_sse2_psll_q: 2453 case Intrinsic::x86_sse2_psll_w: 2454 case Intrinsic::x86_avx2_psll_d: 2455 case Intrinsic::x86_avx2_psll_q: 2456 case Intrinsic::x86_avx2_psll_w: 2457 case Intrinsic::x86_avx512_psll_d_512: 2458 case Intrinsic::x86_avx512_psll_q_512: 2459 case Intrinsic::x86_avx512_psll_w_512: { 2460 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2461 return IC.replaceInstUsesWith(II, V); 2462 } 2463 2464 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2465 // operand to compute the shift amount. 
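    // Marking the upper count elements as not demanded lets later combines
    // simplify whatever feeds them.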
2466 Value *Arg1 = II.getArgOperand(1); 2467 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2468 "Unexpected packed shift size"); 2469 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); 2470 2471 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2472 return IC.replaceOperand(II, 1, V); 2473 } 2474 break; 2475 } 2476 2477 case Intrinsic::x86_avx2_psllv_d: 2478 case Intrinsic::x86_avx2_psllv_d_256: 2479 case Intrinsic::x86_avx2_psllv_q: 2480 case Intrinsic::x86_avx2_psllv_q_256: 2481 case Intrinsic::x86_avx512_psllv_d_512: 2482 case Intrinsic::x86_avx512_psllv_q_512: 2483 case Intrinsic::x86_avx512_psllv_w_128: 2484 case Intrinsic::x86_avx512_psllv_w_256: 2485 case Intrinsic::x86_avx512_psllv_w_512: 2486 case Intrinsic::x86_avx2_psrav_d: 2487 case Intrinsic::x86_avx2_psrav_d_256: 2488 case Intrinsic::x86_avx512_psrav_q_128: 2489 case Intrinsic::x86_avx512_psrav_q_256: 2490 case Intrinsic::x86_avx512_psrav_d_512: 2491 case Intrinsic::x86_avx512_psrav_q_512: 2492 case Intrinsic::x86_avx512_psrav_w_128: 2493 case Intrinsic::x86_avx512_psrav_w_256: 2494 case Intrinsic::x86_avx512_psrav_w_512: 2495 case Intrinsic::x86_avx2_psrlv_d: 2496 case Intrinsic::x86_avx2_psrlv_d_256: 2497 case Intrinsic::x86_avx2_psrlv_q: 2498 case Intrinsic::x86_avx2_psrlv_q_256: 2499 case Intrinsic::x86_avx512_psrlv_d_512: 2500 case Intrinsic::x86_avx512_psrlv_q_512: 2501 case Intrinsic::x86_avx512_psrlv_w_128: 2502 case Intrinsic::x86_avx512_psrlv_w_256: 2503 case Intrinsic::x86_avx512_psrlv_w_512: 2504 if (Value *V = simplifyX86varShift(II, IC.Builder)) { 2505 return IC.replaceInstUsesWith(II, V); 2506 } 2507 break; 2508 2509 case Intrinsic::x86_sse2_packssdw_128: 2510 case Intrinsic::x86_sse2_packsswb_128: 2511 case Intrinsic::x86_avx2_packssdw: 2512 case Intrinsic::x86_avx2_packsswb: 2513 case Intrinsic::x86_avx512_packssdw_512: 2514 case Intrinsic::x86_avx512_packsswb_512: 2515 if (Value *V = simplifyX86pack(II, IC.Builder, true)) { 2516 return IC.replaceInstUsesWith(II, V); 2517 } 2518 break; 2519 2520 case Intrinsic::x86_sse2_packuswb_128: 2521 case Intrinsic::x86_sse41_packusdw: 2522 case Intrinsic::x86_avx2_packusdw: 2523 case Intrinsic::x86_avx2_packuswb: 2524 case Intrinsic::x86_avx512_packusdw_512: 2525 case Intrinsic::x86_avx512_packuswb_512: 2526 if (Value *V = simplifyX86pack(II, IC.Builder, false)) { 2527 return IC.replaceInstUsesWith(II, V); 2528 } 2529 break; 2530 2531 case Intrinsic::x86_sse2_pmadd_wd: 2532 case Intrinsic::x86_avx2_pmadd_wd: 2533 case Intrinsic::x86_avx512_pmaddw_d_512: 2534 if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) { 2535 return IC.replaceInstUsesWith(II, V); 2536 } 2537 break; 2538 2539 case Intrinsic::x86_ssse3_pmadd_ub_sw_128: 2540 case Intrinsic::x86_avx2_pmadd_ub_sw: 2541 case Intrinsic::x86_avx512_pmaddubs_w_512: 2542 if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) { 2543 return IC.replaceInstUsesWith(II, V); 2544 } 2545 break; 2546 2547 case Intrinsic::x86_pclmulqdq: 2548 case Intrinsic::x86_pclmulqdq_256: 2549 case Intrinsic::x86_pclmulqdq_512: { 2550 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2551 unsigned Imm = C->getZExtValue(); 2552 2553 bool MadeChange = false; 2554 Value *Arg0 = II.getArgOperand(0); 2555 Value *Arg1 = II.getArgOperand(1); 2556 unsigned VWidth = 2557 cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2558 2559 APInt UndefElts1(VWidth, 0); 2560 APInt DemandedElts1 = 2561 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 
2 : 1)); 2562 if (Value *V = 2563 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { 2564 IC.replaceOperand(II, 0, V); 2565 MadeChange = true; 2566 } 2567 2568 APInt UndefElts2(VWidth, 0); 2569 APInt DemandedElts2 = 2570 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); 2571 if (Value *V = 2572 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { 2573 IC.replaceOperand(II, 1, V); 2574 MadeChange = true; 2575 } 2576 2577 // If either input elements are undef, the result is zero. 2578 if (DemandedElts1.isSubsetOf(UndefElts1) || 2579 DemandedElts2.isSubsetOf(UndefElts2)) { 2580 return IC.replaceInstUsesWith(II, 2581 ConstantAggregateZero::get(II.getType())); 2582 } 2583 2584 if (MadeChange) { 2585 return &II; 2586 } 2587 } 2588 break; 2589 } 2590 2591 case Intrinsic::x86_sse41_insertps: 2592 if (Value *V = simplifyX86insertps(II, IC.Builder)) { 2593 return IC.replaceInstUsesWith(II, V); 2594 } 2595 break; 2596 2597 case Intrinsic::x86_sse4a_extrq: { 2598 Value *Op0 = II.getArgOperand(0); 2599 Value *Op1 = II.getArgOperand(1); 2600 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2601 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2602 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2603 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2604 VWidth1 == 16 && "Unexpected operand sizes"); 2605 2606 // See if we're dealing with constant values. 2607 auto *C1 = dyn_cast<Constant>(Op1); 2608 auto *CILength = 2609 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 2610 : nullptr; 2611 auto *CIIndex = 2612 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2613 : nullptr; 2614 2615 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 2616 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2617 return IC.replaceInstUsesWith(II, V); 2618 } 2619 2620 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 2621 // operands and the lowest 16-bits of the second. 2622 bool MadeChange = false; 2623 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2624 IC.replaceOperand(II, 0, V); 2625 MadeChange = true; 2626 } 2627 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 2628 IC.replaceOperand(II, 1, V); 2629 MadeChange = true; 2630 } 2631 if (MadeChange) { 2632 return &II; 2633 } 2634 break; 2635 } 2636 2637 case Intrinsic::x86_sse4a_extrqi: { 2638 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 2639 // bits of the lower 64-bits. The upper 64-bits are undefined. 2640 Value *Op0 = II.getArgOperand(0); 2641 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2642 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2643 "Unexpected operand size"); 2644 2645 // See if we're dealing with constant values. 2646 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); 2647 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2648 2649 // Attempt to simplify to a constant or shuffle vector. 2650 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2651 return IC.replaceInstUsesWith(II, V); 2652 } 2653 2654 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 2655 // operand. 
2656 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2657 return IC.replaceOperand(II, 0, V); 2658 } 2659 break; 2660 } 2661 2662 case Intrinsic::x86_sse4a_insertq: { 2663 Value *Op0 = II.getArgOperand(0); 2664 Value *Op1 = II.getArgOperand(1); 2665 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2666 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2667 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2668 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && 2669 "Unexpected operand size"); 2670 2671 // See if we're dealing with constant values. 2672 auto *C1 = dyn_cast<Constant>(Op1); 2673 auto *CI11 = 2674 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2675 : nullptr; 2676 2677 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 2678 if (CI11) { 2679 const APInt &V11 = CI11->getValue(); 2680 APInt Len = V11.zextOrTrunc(6); 2681 APInt Idx = V11.lshr(8).zextOrTrunc(6); 2682 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2683 return IC.replaceInstUsesWith(II, V); 2684 } 2685 } 2686 2687 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 2688 // operand. 2689 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2690 return IC.replaceOperand(II, 0, V); 2691 } 2692 break; 2693 } 2694 2695 case Intrinsic::x86_sse4a_insertqi: { 2696 // INSERTQI: Extract lowest Length bits from lower half of second source and 2697 // insert over first source starting at Index bit. The upper 64-bits are 2698 // undefined. 2699 Value *Op0 = II.getArgOperand(0); 2700 Value *Op1 = II.getArgOperand(1); 2701 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2702 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2703 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2704 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2705 VWidth1 == 2 && "Unexpected operand sizes"); 2706 2707 // See if we're dealing with constant values. 2708 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2709 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); 2710 2711 // Attempt to simplify to a constant or shuffle vector. 2712 if (CILength && CIIndex) { 2713 APInt Len = CILength->getValue().zextOrTrunc(6); 2714 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 2715 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2716 return IC.replaceInstUsesWith(II, V); 2717 } 2718 } 2719 2720 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 2721 // operands. 2722 bool MadeChange = false; 2723 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2724 IC.replaceOperand(II, 0, V); 2725 MadeChange = true; 2726 } 2727 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 2728 IC.replaceOperand(II, 1, V); 2729 MadeChange = true; 2730 } 2731 if (MadeChange) { 2732 return &II; 2733 } 2734 break; 2735 } 2736 2737 case Intrinsic::x86_sse41_pblendvb: 2738 case Intrinsic::x86_sse41_blendvps: 2739 case Intrinsic::x86_sse41_blendvpd: 2740 case Intrinsic::x86_avx_blendv_ps_256: 2741 case Intrinsic::x86_avx_blendv_pd_256: 2742 case Intrinsic::x86_avx2_pblendvb: { 2743 // fold (blend A, A, Mask) -> A 2744 Value *Op0 = II.getArgOperand(0); 2745 Value *Op1 = II.getArgOperand(1); 2746 Value *Mask = II.getArgOperand(2); 2747 if (Op0 == Op1) { 2748 return IC.replaceInstUsesWith(II, Op0); 2749 } 2750 2751 // Zero Mask - select 1st argument. 
2752 if (isa<ConstantAggregateZero>(Mask)) { 2753 return IC.replaceInstUsesWith(II, Op0); 2754 } 2755 2756 // Constant Mask - select 1st/2nd argument lane based on top bit of mask. 2757 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) { 2758 Constant *NewSelector = 2759 getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout()); 2760 return SelectInst::Create(NewSelector, Op1, Op0, "blendv"); 2761 } 2762 2763 // Convert to a vector select if we can bypass casts and find a boolean 2764 // vector condition value. 2765 Value *BoolVec; 2766 Mask = InstCombiner::peekThroughBitcast(Mask); 2767 if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) && 2768 BoolVec->getType()->isVectorTy() && 2769 BoolVec->getType()->getScalarSizeInBits() == 1) { 2770 auto *MaskTy = cast<FixedVectorType>(Mask->getType()); 2771 auto *OpTy = cast<FixedVectorType>(II.getType()); 2772 assert(MaskTy->getPrimitiveSizeInBits() == 2773 OpTy->getPrimitiveSizeInBits() && 2774 "Not expecting mask and operands with different sizes"); 2775 unsigned NumMaskElts = MaskTy->getNumElements(); 2776 unsigned NumOperandElts = OpTy->getNumElements(); 2777 2778 if (NumMaskElts == NumOperandElts) { 2779 return SelectInst::Create(BoolVec, Op1, Op0); 2780 } 2781 2782 // If the mask has less elements than the operands, each mask bit maps to 2783 // multiple elements of the operands. Bitcast back and forth. 2784 if (NumMaskElts < NumOperandElts) { 2785 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy); 2786 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy); 2787 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 2788 return new BitCastInst(Sel, II.getType()); 2789 } 2790 } 2791 2792 break; 2793 } 2794 2795 case Intrinsic::x86_ssse3_pshuf_b_128: 2796 case Intrinsic::x86_avx2_pshuf_b: 2797 case Intrinsic::x86_avx512_pshuf_b_512: 2798 if (Value *V = simplifyX86pshufb(II, IC.Builder)) { 2799 return IC.replaceInstUsesWith(II, V); 2800 } 2801 break; 2802 2803 case Intrinsic::x86_avx_vpermilvar_ps: 2804 case Intrinsic::x86_avx_vpermilvar_ps_256: 2805 case Intrinsic::x86_avx512_vpermilvar_ps_512: 2806 case Intrinsic::x86_avx_vpermilvar_pd: 2807 case Intrinsic::x86_avx_vpermilvar_pd_256: 2808 case Intrinsic::x86_avx512_vpermilvar_pd_512: 2809 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { 2810 return IC.replaceInstUsesWith(II, V); 2811 } 2812 break; 2813 2814 case Intrinsic::x86_avx2_permd: 2815 case Intrinsic::x86_avx2_permps: 2816 case Intrinsic::x86_avx512_permvar_df_256: 2817 case Intrinsic::x86_avx512_permvar_df_512: 2818 case Intrinsic::x86_avx512_permvar_di_256: 2819 case Intrinsic::x86_avx512_permvar_di_512: 2820 case Intrinsic::x86_avx512_permvar_hi_128: 2821 case Intrinsic::x86_avx512_permvar_hi_256: 2822 case Intrinsic::x86_avx512_permvar_hi_512: 2823 case Intrinsic::x86_avx512_permvar_qi_128: 2824 case Intrinsic::x86_avx512_permvar_qi_256: 2825 case Intrinsic::x86_avx512_permvar_qi_512: 2826 case Intrinsic::x86_avx512_permvar_sf_512: 2827 case Intrinsic::x86_avx512_permvar_si_512: 2828 if (Value *V = simplifyX86vpermv(II, IC.Builder)) { 2829 return IC.replaceInstUsesWith(II, V); 2830 } 2831 break; 2832 2833 case Intrinsic::x86_avx_maskload_ps: 2834 case Intrinsic::x86_avx_maskload_pd: 2835 case Intrinsic::x86_avx_maskload_ps_256: 2836 case Intrinsic::x86_avx_maskload_pd_256: 2837 case Intrinsic::x86_avx2_maskload_d: 2838 case Intrinsic::x86_avx2_maskload_q: 2839 case Intrinsic::x86_avx2_maskload_d_256: 2840 case Intrinsic::x86_avx2_maskload_q_256: 2841 if (Instruction *I = 
simplifyX86MaskedLoad(II, IC)) { 2842 return I; 2843 } 2844 break; 2845 2846 case Intrinsic::x86_sse2_maskmov_dqu: 2847 case Intrinsic::x86_avx_maskstore_ps: 2848 case Intrinsic::x86_avx_maskstore_pd: 2849 case Intrinsic::x86_avx_maskstore_ps_256: 2850 case Intrinsic::x86_avx_maskstore_pd_256: 2851 case Intrinsic::x86_avx2_maskstore_d: 2852 case Intrinsic::x86_avx2_maskstore_q: 2853 case Intrinsic::x86_avx2_maskstore_d_256: 2854 case Intrinsic::x86_avx2_maskstore_q_256: 2855 if (simplifyX86MaskedStore(II, IC)) { 2856 return nullptr; 2857 } 2858 break; 2859 2860 case Intrinsic::x86_addcarry_32: 2861 case Intrinsic::x86_addcarry_64: 2862 if (Value *V = simplifyX86addcarry(II, IC.Builder)) { 2863 return IC.replaceInstUsesWith(II, V); 2864 } 2865 break; 2866 2867 case Intrinsic::x86_avx512_pternlog_d_128: 2868 case Intrinsic::x86_avx512_pternlog_d_256: 2869 case Intrinsic::x86_avx512_pternlog_d_512: 2870 case Intrinsic::x86_avx512_pternlog_q_128: 2871 case Intrinsic::x86_avx512_pternlog_q_256: 2872 case Intrinsic::x86_avx512_pternlog_q_512: 2873 if (Value *V = simplifyTernarylogic(II, IC.Builder)) { 2874 return IC.replaceInstUsesWith(II, V); 2875 } 2876 break; 2877 default: 2878 break; 2879 } 2880 return std::nullopt; 2881 } 2882 2883 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic( 2884 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, 2885 bool &KnownBitsComputed) const { 2886 switch (II.getIntrinsicID()) { 2887 default: 2888 break; 2889 case Intrinsic::x86_mmx_pmovmskb: 2890 case Intrinsic::x86_sse_movmsk_ps: 2891 case Intrinsic::x86_sse2_movmsk_pd: 2892 case Intrinsic::x86_sse2_pmovmskb_128: 2893 case Intrinsic::x86_avx_movmsk_ps_256: 2894 case Intrinsic::x86_avx_movmsk_pd_256: 2895 case Intrinsic::x86_avx2_pmovmskb: { 2896 // MOVMSK copies the vector elements' sign bits to the low bits 2897 // and zeros the high bits. 2898 unsigned ArgWidth; 2899 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { 2900 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. 2901 } else { 2902 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType()); 2903 ArgWidth = ArgType->getNumElements(); 2904 } 2905 2906 // If we don't need any of low bits then return zero, 2907 // we know that DemandedMask is non-zero already. 2908 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); 2909 Type *VTy = II.getType(); 2910 if (DemandedElts.isZero()) { 2911 return ConstantInt::getNullValue(VTy); 2912 } 2913 2914 // We know that the upper bits are set to zero. 2915 Known.Zero.setBitsFrom(ArgWidth); 2916 KnownBitsComputed = true; 2917 break; 2918 } 2919 } 2920 return std::nullopt; 2921 } 2922 2923 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( 2924 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 2925 APInt &UndefElts2, APInt &UndefElts3, 2926 std::function<void(Instruction *, unsigned, APInt, APInt &)> 2927 simplifyAndSetOp) const { 2928 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements(); 2929 switch (II.getIntrinsicID()) { 2930 default: 2931 break; 2932 case Intrinsic::x86_xop_vfrcz_ss: 2933 case Intrinsic::x86_xop_vfrcz_sd: 2934 // The instructions for these intrinsics are speced to zero upper bits not 2935 // pass them through like other scalar intrinsics. So we shouldn't just 2936 // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. 2937 // Instead we should return a zero vector. 
2938 if (!DemandedElts[0]) { 2939 IC.addToWorklist(&II); 2940 return ConstantAggregateZero::get(II.getType()); 2941 } 2942 2943 // Only the lower element is used. 2944 DemandedElts = 1; 2945 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 2946 2947 // Only the lower element is undefined. The high elements are zero. 2948 UndefElts = UndefElts[0]; 2949 break; 2950 2951 // Unary scalar-as-vector operations that work column-wise. 2952 case Intrinsic::x86_sse_rcp_ss: 2953 case Intrinsic::x86_sse_rsqrt_ss: 2954 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 2955 2956 // If lowest element of a scalar op isn't used then use Arg0. 2957 if (!DemandedElts[0]) { 2958 IC.addToWorklist(&II); 2959 return II.getArgOperand(0); 2960 } 2961 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions 2962 // checks). 2963 break; 2964 2965 // Binary scalar-as-vector operations that work column-wise. The high 2966 // elements come from operand 0. The low element is a function of both 2967 // operands. 2968 case Intrinsic::x86_sse_min_ss: 2969 case Intrinsic::x86_sse_max_ss: 2970 case Intrinsic::x86_sse_cmp_ss: 2971 case Intrinsic::x86_sse2_min_sd: 2972 case Intrinsic::x86_sse2_max_sd: 2973 case Intrinsic::x86_sse2_cmp_sd: { 2974 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 2975 2976 // If lowest element of a scalar op isn't used then use Arg0. 2977 if (!DemandedElts[0]) { 2978 IC.addToWorklist(&II); 2979 return II.getArgOperand(0); 2980 } 2981 2982 // Only lower element is used for operand 1. 2983 DemandedElts = 1; 2984 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 2985 2986 // Lower element is undefined if both lower elements are undefined. 2987 // Consider things like undef&0. The result is known zero, not undef. 2988 if (!UndefElts2[0]) 2989 UndefElts.clearBit(0); 2990 2991 break; 2992 } 2993 2994 // Binary scalar-as-vector operations that work column-wise. The high 2995 // elements come from operand 0 and the low element comes from operand 1. 2996 case Intrinsic::x86_sse41_round_ss: 2997 case Intrinsic::x86_sse41_round_sd: { 2998 // Don't use the low element of operand 0. 2999 APInt DemandedElts2 = DemandedElts; 3000 DemandedElts2.clearBit(0); 3001 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); 3002 3003 // If lowest element of a scalar op isn't used then use Arg0. 3004 if (!DemandedElts[0]) { 3005 IC.addToWorklist(&II); 3006 return II.getArgOperand(0); 3007 } 3008 3009 // Only lower element is used for operand 1. 3010 DemandedElts = 1; 3011 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3012 3013 // Take the high undef elements from operand 0 and take the lower element 3014 // from operand 1. 3015 UndefElts.clearBit(0); 3016 UndefElts |= UndefElts2[0]; 3017 break; 3018 } 3019 3020 // Three input scalar-as-vector operations that work column-wise. The high 3021 // elements come from operand 0 and the low element is a function of all 3022 // three inputs. 
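  // (Operand 2 is the passthru vector and operand 3 is the mask, so element 0
  //  of the result can depend on element 0 of operands 0, 1 and 2.)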
3023 case Intrinsic::x86_avx512_mask_add_ss_round: 3024 case Intrinsic::x86_avx512_mask_div_ss_round: 3025 case Intrinsic::x86_avx512_mask_mul_ss_round: 3026 case Intrinsic::x86_avx512_mask_sub_ss_round: 3027 case Intrinsic::x86_avx512_mask_max_ss_round: 3028 case Intrinsic::x86_avx512_mask_min_ss_round: 3029 case Intrinsic::x86_avx512_mask_add_sd_round: 3030 case Intrinsic::x86_avx512_mask_div_sd_round: 3031 case Intrinsic::x86_avx512_mask_mul_sd_round: 3032 case Intrinsic::x86_avx512_mask_sub_sd_round: 3033 case Intrinsic::x86_avx512_mask_max_sd_round: 3034 case Intrinsic::x86_avx512_mask_min_sd_round: 3035 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3036 3037 // If lowest element of a scalar op isn't used then use Arg0. 3038 if (!DemandedElts[0]) { 3039 IC.addToWorklist(&II); 3040 return II.getArgOperand(0); 3041 } 3042 3043 // Only lower element is used for operand 1 and 2. 3044 DemandedElts = 1; 3045 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3046 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); 3047 3048 // Lower element is undefined if all three lower elements are undefined. 3049 // Consider things like undef&0. The result is known zero, not undef. 3050 if (!UndefElts2[0] || !UndefElts3[0]) 3051 UndefElts.clearBit(0); 3052 break; 3053 3054 // TODO: Add fmaddsub support? 3055 case Intrinsic::x86_sse3_addsub_pd: 3056 case Intrinsic::x86_sse3_addsub_ps: 3057 case Intrinsic::x86_avx_addsub_pd_256: 3058 case Intrinsic::x86_avx_addsub_ps_256: { 3059 // If none of the even or none of the odd lanes are required, turn this 3060 // into a generic FP math instruction. 3061 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); 3062 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); 3063 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); 3064 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); 3065 if (IsSubOnly || IsAddOnly) { 3066 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); 3067 IRBuilderBase::InsertPointGuard Guard(IC.Builder); 3068 IC.Builder.SetInsertPoint(&II); 3069 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); 3070 return IC.Builder.CreateBinOp( 3071 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1); 3072 } 3073 3074 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3075 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3076 UndefElts &= UndefElts2; 3077 break; 3078 } 3079 3080 // General per-element vector operations. 
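  // For these variable shifts, each result element depends only on the
  // same-numbered element of each operand, so demanded and undef elements
  // propagate through both operands unchanged.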
3081 case Intrinsic::x86_avx2_psllv_d: 3082 case Intrinsic::x86_avx2_psllv_d_256: 3083 case Intrinsic::x86_avx2_psllv_q: 3084 case Intrinsic::x86_avx2_psllv_q_256: 3085 case Intrinsic::x86_avx2_psrlv_d: 3086 case Intrinsic::x86_avx2_psrlv_d_256: 3087 case Intrinsic::x86_avx2_psrlv_q: 3088 case Intrinsic::x86_avx2_psrlv_q_256: 3089 case Intrinsic::x86_avx2_psrav_d: 3090 case Intrinsic::x86_avx2_psrav_d_256: { 3091 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3092 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3093 UndefElts &= UndefElts2; 3094 break; 3095 } 3096 3097 case Intrinsic::x86_sse2_packssdw_128: 3098 case Intrinsic::x86_sse2_packsswb_128: 3099 case Intrinsic::x86_sse2_packuswb_128: 3100 case Intrinsic::x86_sse41_packusdw: 3101 case Intrinsic::x86_avx2_packssdw: 3102 case Intrinsic::x86_avx2_packsswb: 3103 case Intrinsic::x86_avx2_packusdw: 3104 case Intrinsic::x86_avx2_packuswb: 3105 case Intrinsic::x86_avx512_packssdw_512: 3106 case Intrinsic::x86_avx512_packsswb_512: 3107 case Intrinsic::x86_avx512_packusdw_512: 3108 case Intrinsic::x86_avx512_packuswb_512: { 3109 auto *Ty0 = II.getArgOperand(0)->getType(); 3110 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); 3111 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); 3112 3113 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; 3114 unsigned VWidthPerLane = VWidth / NumLanes; 3115 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; 3116 3117 // Per lane, pack the elements of the first input and then the second. 3118 // e.g. 3119 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) 3120 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) 3121 for (int OpNum = 0; OpNum != 2; ++OpNum) { 3122 APInt OpDemandedElts(InnerVWidth, 0); 3123 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3124 unsigned LaneIdx = Lane * VWidthPerLane; 3125 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { 3126 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; 3127 if (DemandedElts[Idx]) 3128 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); 3129 } 3130 } 3131 3132 // Demand elements from the operand. 3133 APInt OpUndefElts(InnerVWidth, 0); 3134 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); 3135 3136 // Pack the operand's UNDEF elements, one lane at a time. 3137 OpUndefElts = OpUndefElts.zext(VWidth); 3138 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3139 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); 3140 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); 3141 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); 3142 UndefElts |= LaneElts; 3143 } 3144 } 3145 break; 3146 } 3147 3148 case Intrinsic::x86_sse2_pmadd_wd: 3149 case Intrinsic::x86_avx2_pmadd_wd: 3150 case Intrinsic::x86_avx512_pmaddw_d_512: 3151 case Intrinsic::x86_ssse3_pmadd_ub_sw_128: 3152 case Intrinsic::x86_avx2_pmadd_ub_sw: 3153 case Intrinsic::x86_avx512_pmaddubs_w_512: { 3154 // PMADD - demand both src elements that map to each dst element. 
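    // e.g. for pmaddwd, result element I is formed from source elements 2*I
    // and 2*I+1 of both operands; ScaleBitMask expands DemandedElts
    // accordingly below.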
3155 auto *ArgTy = II.getArgOperand(0)->getType(); 3156 unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements(); 3157 assert((VWidth * 2) == InnerVWidth && "Unexpected input size"); 3158 APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth); 3159 APInt Op0UndefElts(InnerVWidth, 0); 3160 APInt Op1UndefElts(InnerVWidth, 0); 3161 simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts); 3162 simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts); 3163 break; 3164 } 3165 3166 // PSHUFB 3167 case Intrinsic::x86_ssse3_pshuf_b_128: 3168 case Intrinsic::x86_avx2_pshuf_b: 3169 case Intrinsic::x86_avx512_pshuf_b_512: 3170 // PERMILVAR 3171 case Intrinsic::x86_avx_vpermilvar_ps: 3172 case Intrinsic::x86_avx_vpermilvar_ps_256: 3173 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3174 case Intrinsic::x86_avx_vpermilvar_pd: 3175 case Intrinsic::x86_avx_vpermilvar_pd_256: 3176 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3177 // PERMV 3178 case Intrinsic::x86_avx2_permd: 3179 case Intrinsic::x86_avx2_permps: { 3180 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); 3181 break; 3182 } 3183 3184 // SSE4A instructions leave the upper 64-bits of the 128-bit result 3185 // in an undefined state. 3186 case Intrinsic::x86_sse4a_extrq: 3187 case Intrinsic::x86_sse4a_extrqi: 3188 case Intrinsic::x86_sse4a_insertq: 3189 case Intrinsic::x86_sse4a_insertqi: 3190 UndefElts.setHighBits(VWidth / 2); 3191 break; 3192 } 3193 return std::nullopt; 3194 } 3195