//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
                                      Constant::getNullValue(IntTy), V, DL);
  assert(V && "Vector must be foldable");
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask, DL);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (match(Mask, m_SExt(m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
    // the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
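    // Illustrative example (not from the original source): a maskload whose
    // mask is the constant <i32 -1, i32 0, i32 -1, i32 0> becomes an
    // @llvm.masked.load call with mask <i1 1, i1 0, i1 1, i1 0>, align 1 and
    // a zeroinitializer pass-through.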
    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(Vec->getType());
  Type *SVT = VT->getElementType();
  Type *AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts combine
  // to zero and arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits =
        llvm::computeKnownBits(Amt, II.getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits = llvm::computeKnownBits(
        Amt, DemandedLower, II.getDataLayout());
    KnownBits KnownUpperBits = llvm::computeKnownBits(
        Amt, DemandedUpper, II.getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
  // operand to compute the shift amount.
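  // Illustrative example (hypothetical values): for a psrl.w shift-by-scalar
  // whose low 64 bits hold <i16 3, i16 0, i16 0, i16 0>, the concatenated
  // count below is 3, so every lane is logically shifted right by 3.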
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isZero())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(II.getType());
  Type *SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  KnownBits KnownAmt =
      llvm::computeKnownBits(Amt, II.getDataLayout());
  if (KnownAmt.getMaxValue().ult(BitWidth)) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getZero(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86pmulh(IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder, bool IsSigned,
                               bool IsRounding) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  auto *ResTy = cast<FixedVectorType>(II.getType());
  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 &&
         "Unexpected PMULH types");
  assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed");

  // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
  if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Multiply by zero.
  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Multiply by one.
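  // (Multiplying by a splat of 1 leaves only the sign bits of the other
  // operand in the high half, hence the AShr-by-15 below for the signed case;
  // for the unsigned case the high half is always zero.)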
  if (!IsRounding) {
    if (match(Arg0, m_One()))
      return IsSigned ? Builder.CreateAShr(Arg1, 15)
                      : ConstantAggregateZero::get(ResTy);
    if (match(Arg1, m_One()))
      return IsSigned ? Builder.CreateAShr(Arg0, 15)
                      : ConstantAggregateZero::get(ResTy);
  }

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Extend to twice the width and multiply.
  auto Cast =
      IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
  auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy);
  Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy);
  Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy);
  Value *Mul = Builder.CreateMul(LHS, RHS);

  if (IsRounding) {
    // PMULHRSW: truncate to vXi18 of the most significant bits, add one and
    // extract bits[16:1].
    auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18);
    auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy);
    Mul = Builder.CreateLShr(Mul, 14);
    Mul = Builder.CreateTrunc(Mul, RndTy);
    Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1));
    Mul = Builder.CreateLShr(Mul, 1);
  } else {
    // PMULH/PMULHU: extract the vXi16 most significant bits.
    Mul = Builder.CreateLShr(Mul, 16);
  }

  return Builder.CreateTrunc(Mul, ResTy);
}

static Value *simplifyX86pmadd(IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder,
                               bool IsPMADDWD) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  auto *ResTy = cast<FixedVectorType>(II.getType());
  [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());

  unsigned NumDstElts = ResTy->getNumElements();
  assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
         ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
         "Unexpected PMADD types");

  // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
  if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Multiply by zero.
  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Split Lo/Hi elements pairs, extend and add together.
  // PMADDWD(X,Y) =
  // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
  // PMADDUBSW(X,Y) =
  // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
  SmallVector<int> LoMask, HiMask;
  for (unsigned I = 0; I != NumDstElts; ++I) {
    LoMask.push_back(2 * I + 0);
    HiMask.push_back(2 * I + 1);
  }

  auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
  auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
  auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
  auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);

  auto LHSCast =
      IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
  LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
  LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
  RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
  RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
  Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
  Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
  return IsPMADDWD
             ? Builder.CreateAdd(Lo, Hi)
             : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  // Preserve previous behavior and give up.
  // TODO: treat as <8 x i8>.
  if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb)
    return nullptr;

  auto *ArgTy = cast<FixedVectorType>(Arg->getType());

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = PoisonValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyTernarylogic(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {

  auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
  if (!ArgImm || ArgImm->getValue().uge(256))
    return nullptr;

  Value *ArgA = II.getArgOperand(0);
  Value *ArgB = II.getArgOperand(1);
  Value *ArgC = II.getArgOperand(2);

  Type *Ty = II.getType();

  auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
  };
  auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
  };
  auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
  };
  auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateNot(V.first), ~V.second};
  };
  auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
  auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
  auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };

  bool AIsConst = match(ArgA, m_ImmConstant());
  bool BIsConst = match(ArgB, m_ImmConstant());
  bool CIsConst = match(ArgC, m_ImmConstant());

  bool ABIsConst = AIsConst && BIsConst;
  bool ACIsConst = AIsConst && CIsConst;
  bool BCIsConst = BIsConst && CIsConst;
  bool ABCIsConst = AIsConst && BIsConst && CIsConst;

  // Use for verification. It's a big table. It's difficult to go from Imm ->
  // logic ops, but easy to verify that a set of logic ops is correct. We track
  // the logic ops through the second value in the pair. At the end it should
  // equal Imm.
  std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
  std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
  std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
  std::pair<Value *, uint8_t> Res = {nullptr, 0};

  // Currently we only handle cases that convert directly to another instruction
  // or cases where all the ops are constant. This is because we don't properly
  // handle creating ternary ops in the backend, so splitting them here may
  // cause regressions. As the backend improves, uncomment more cases.
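  // The immediate is a truth table over these masks: bit i of Imm is the
  // function's output when A/B/C take the i'th bit of 0xf0/0xcc/0xaa. For
  // example, Imm == 0xca encodes the bitwise select (A & B) | (~A & C).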

  uint8_t Imm = ArgImm->getValue().getZExtValue();
  switch (Imm) {
  case 0x0:
    Res = {Constant::getNullValue(Ty), 0};
    break;
  case 0x1:
    if (ABCIsConst)
      Res = Nor(Or(A, B), C);
    break;
  case 0x2:
    if (ABCIsConst)
      Res = And(Nor(A, B), C);
    break;
  case 0x3:
    if (ABIsConst)
      Res = Nor(A, B);
    break;
  case 0x4:
    if (ABCIsConst)
      Res = And(Nor(A, C), B);
    break;
  case 0x5:
    if (ACIsConst)
      Res = Nor(A, C);
    break;
  case 0x6:
    if (ABCIsConst)
      Res = Nor(A, Xnor(B, C));
    break;
  case 0x7:
    if (ABCIsConst)
      Res = Nor(A, And(B, C));
    break;
  case 0x8:
    if (ABCIsConst)
      Res = Nor(A, Nand(B, C));
    break;
  case 0x9:
    if (ABCIsConst)
      Res = Nor(A, Xor(B, C));
    break;
  case 0xa:
    if (ACIsConst)
      Res = Nor(A, Not(C));
    break;
  case 0xb:
    if (ABCIsConst)
      Res = Nor(A, Nor(C, Not(B)));
    break;
  case 0xc:
    if (ABIsConst)
      Res = Nor(A, Not(B));
    break;
  case 0xd:
    if (ABCIsConst)
      Res = Nor(A, Nor(B, Not(C)));
    break;
  case 0xe:
    if (ABCIsConst)
      Res = Nor(A, Nor(B, C));
    break;
  case 0xf:
    Res = Not(A);
    break;
  case 0x10:
    if (ABCIsConst)
      Res = And(A, Nor(B, C));
    break;
  case 0x11:
    if (BCIsConst)
      Res = Nor(B, C);
    break;
  case 0x12:
    if (ABCIsConst)
      Res = Nor(Xnor(A, C), B);
    break;
  case 0x13:
    if (ABCIsConst)
      Res = Nor(And(A, C), B);
    break;
  case 0x14:
    if (ABCIsConst)
      Res = Nor(Xnor(A, B), C);
    break;
  case 0x15:
    if (ABCIsConst)
      Res = Nor(And(A, B), C);
    break;
  case 0x16:
    if (ABCIsConst)
      Res = Xor(Xor(A, B), And(Nand(A, B), C));
    break;
  case 0x17:
    if (ABCIsConst)
      Res = Xor(Or(A, B), Or(Xnor(A, B), C));
    break;
  case 0x18:
    if (ABCIsConst)
      Res = Nor(Xnor(A, B), Xnor(A, C));
    break;
  case 0x19:
    if (ABCIsConst)
      Res = And(Nand(A, B), Xnor(B, C));
    break;
  case 0x1a:
    if (ABCIsConst)
      Res = Xor(A, Or(And(A, B), C));
    break;
  case 0x1b:
    if (ABCIsConst)
      Res = Xor(A, Or(Xnor(A, B), C));
    break;
  case 0x1c:
    if (ABCIsConst)
      Res = Xor(A, Or(And(A, C), B));
    break;
  case 0x1d:
    if (ABCIsConst)
      Res = Xor(A, Or(Xnor(A, C), B));
    break;
  case 0x1e:
    if (ABCIsConst)
      Res = Xor(A, Or(B, C));
    break;
  case 0x1f:
    if (ABCIsConst)
      Res = Nand(A, Or(B, C));
    break;
  case 0x20:
    if (ABCIsConst)
      Res = Nor(Nand(A, C), B);
    break;
  case 0x21:
    if (ABCIsConst)
      Res = Nor(Xor(A, C), B);
    break;
  case 0x22:
    if (BCIsConst)
      Res = Nor(B, Not(C));
    break;
  case 0x23:
    if (ABCIsConst)
      Res = Nor(B, Nor(C, Not(A)));
    break;
  case 0x24:
    if (ABCIsConst)
      Res = Nor(Xnor(A, B), Xor(A, C));
    break;
  case 0x25:
    if (ABCIsConst)
      Res = Xor(A, Nand(Nand(A, B), C));
    break;
  case 0x26:
    if (ABCIsConst)
      Res = And(Nand(A, B), Xor(B, C));
    break;
  case 0x27:
    if (ABCIsConst)
      Res = Xor(Or(Xnor(A, B), C), B);
    break;
  case 0x28:
    if (ABCIsConst)
      Res = And(Xor(A, B), C);
    break;
  case 0x29:
    if (ABCIsConst)
      Res = Xor(Xor(A, B), Nor(And(A, B), C));
    break;
  case 0x2a:
    if (ABCIsConst)
      Res = And(Nand(A, B), C);
    break;
  case 0x2b:
    if (ABCIsConst)
      Res = Xor(Or(Xnor(A, B), Xor(A, C)), A);
    break;
  case 0x2c:
    if (ABCIsConst)
      Res = Nor(Xnor(A, B), Nor(B, C));
    break;
  case 0x2d:
    if (ABCIsConst)
      Res = Xor(A, Or(B, Not(C)));
    break;
  case 0x2e:
    if (ABCIsConst)
      Res = Xor(A, Or(Xor(A, C), B));
    break;
  case 0x2f:
    if (ABCIsConst)
      Res = Nand(A, Or(B, Not(C)));
    break;
  case 0x30:
    if (ABIsConst)
      Res = Nor(B, Not(A));
    break;
  case 0x31:
    if (ABCIsConst)
      Res = Nor(Nor(A, Not(C)), B);
    break;
  case 0x32:
    if (ABCIsConst)
      Res = Nor(Nor(A, C), B);
    break;
  case 0x33:
    Res = Not(B);
    break;
  case 0x34:
    if (ABCIsConst)
      Res = And(Xor(A, B), Nand(B, C));
    break;
  case 0x35:
    if (ABCIsConst)
      Res = Xor(B, Or(A, Xnor(B, C)));
    break;
  case 0x36:
    if (ABCIsConst)
      Res = Xor(Or(A, C), B);
    break;
  case 0x37:
    if (ABCIsConst)
      Res = Nand(Or(A, C), B);
    break;
  case 0x38:
    if (ABCIsConst)
      Res = Nor(Xnor(A, B), Nor(A, C));
    break;
  case 0x39:
    if (ABCIsConst)
      Res = Xor(Or(A, Not(C)), B);
    break;
  case 0x3a:
    if (ABCIsConst)
      Res = Xor(B, Or(A, Xor(B, C)));
    break;
  case 0x3b:
    if (ABCIsConst)
      Res = Nand(Or(A, Not(C)), B);
    break;
  case 0x3c:
    Res = Xor(A, B);
    break;
  case 0x3d:
    if (ABCIsConst)
      Res = Xor(A, Or(Nor(A, C), B));
    break;
  case 0x3e:
    if (ABCIsConst)
      Res = Xor(A, Or(Nor(A, Not(C)), B));
    break;
  case 0x3f:
    if (ABIsConst)
      Res = Nand(A, B);
    break;
  case 0x40:
    if (ABCIsConst)
      Res = Nor(Nand(A, B), C);
    break;
  case 0x41:
    if (ABCIsConst)
      Res = Nor(Xor(A, B), C);
    break;
  case 0x42:
    if (ABCIsConst)
      Res = Nor(Xor(A, B), Xnor(A, C));
    break;
  case 0x43:
    if (ABCIsConst)
      Res = Xor(A, Nand(Nand(A, C), B));
    break;
  case 0x44:
    if (BCIsConst)
      Res = Nor(C, Not(B));
    break;
  case 0x45:
    if (ABCIsConst)
      Res = Nor(Nor(B, Not(A)), C);
    break;
  case 0x46:
    if (ABCIsConst)
      Res = Xor(Or(And(A, C), B), C);
    break;
  case 0x47:
    if (ABCIsConst)
      Res = Xor(Or(Xnor(A, C), B), C);
    break;
  case 0x48:
    if (ABCIsConst)
      Res = And(Xor(A, C), B);
    break;
  case 0x49:
    if (ABCIsConst)
      Res = Xor(Or(Xnor(A, B), And(A, C)), C);
    break;
  case 0x4a:
    if (ABCIsConst)
      Res = Nor(Xnor(A, C), Nor(B, C));
    break;
  case 0x4b:
    if (ABCIsConst)
      Res = Xor(A, Or(C, Not(B)));
    break;
  case 0x4c:
    if (ABCIsConst)
      Res = And(Nand(A, C), B);
    break;
  case 0x4d:
    if (ABCIsConst)
      Res = Xor(Or(Xor(A, B), Xnor(A, C)), A);
    break;
  case 0x4e:
    if (ABCIsConst)
      Res = Xor(A, Or(Xor(A, B), C));
    break;
  case 0x4f:
    if (ABCIsConst)
      Res = Nand(A, Nand(B, Not(C)));
    break;
  case 0x50:
    if (ACIsConst)
      Res = Nor(C, Not(A));
    break;
  case 0x51:
    if (ABCIsConst)
      Res = Nor(Nor(A, Not(B)), C);
    break;
  case 0x52:
    if (ABCIsConst)
      Res = And(Xor(A, C), Nand(B, C));
    break;
  case 0x53:
    if (ABCIsConst)
      Res = Xor(Or(Xnor(B, C), A), C);
    break;
  case 0x54:
    if (ABCIsConst)
      Res = Nor(Nor(A, B), C);
    break;
  case 0x55:
    Res = Not(C);
    break;
  case 0x56:
    if (ABCIsConst)
      Res = Xor(Or(A, B), C);
    break;
  case 0x57:
    if (ABCIsConst)
      Res = Nand(Or(A, B), C);
    break;
  case 0x58:
    if (ABCIsConst)
      Res = Nor(Nor(A, B), Xnor(A, C));
    break;
  case 0x59:
    if (ABCIsConst)
      Res = Xor(Or(A, Not(B)), C);
    break;
  case 0x5a:
    Res = Xor(A, C);
    break;
  case 0x5b:
    if (ABCIsConst)
      Res = Xor(A, Or(Nor(A, B), C));
    break;
  case 0x5c:
    if (ABCIsConst)
      Res = Xor(Or(Xor(B, C), A), C);
    break;
  case 0x5d:
    if (ABCIsConst)
      Res = Nand(Or(A, Not(B)), C);
    break;
  case 0x5e:
    if (ABCIsConst)
      Res = Xor(A, Or(Nor(A, Not(B)), C));
    break;
  case 0x5f:
    if (ACIsConst)
      Res = Nand(A, C);
    break;
  case 0x60:
    if (ABCIsConst)
      Res = And(A, Xor(B, C));
    break;
  case 0x61:
    if (ABCIsConst)
      Res = Xor(Or(Xnor(A, B), And(B, C)), C);
    break;
  case 0x62:
    if (ABCIsConst)
      Res = Nor(Nor(A, C), Xnor(B, C));
    break;
  case 0x63:
    if (ABCIsConst)
      Res = Xor(B, Or(C, Not(A)));
    break;
  case 0x64:
    if (ABCIsConst)
      Res = Nor(Nor(A, B), Xnor(B, C));
    break;
  case 0x65:
    if (ABCIsConst)
      Res = Xor(Or(B, Not(A)), C);
    break;
  case 0x66:
    Res = Xor(B, C);
    break;
  case 0x67:
    if (ABCIsConst)
      Res = Or(Nor(A, B), Xor(B, C));
    break;
  case 0x68:
    if (ABCIsConst)
      Res = Xor(Xor(A, B), Nor(Nor(A, B), C));
    break;
  case 0x69:
    if (ABCIsConst)
      Res = Xor(Xnor(A, B), C);
    break;
  case 0x6a:
    if (ABCIsConst)
      Res = Xor(And(A, B), C);
    break;
  case 0x6b:
    if (ABCIsConst)
      Res = Or(Nor(A, B), Xor(Xnor(A, B), C));
    break;
  case 0x6c:
    if (ABCIsConst)
      Res = Xor(And(A, C), B);
    break;
  case 0x6d:
    if (ABCIsConst)
      Res = Xor(Or(Xnor(A, B), Nor(A, C)), C);
    break;
  case 0x6e:
    if (ABCIsConst)
      Res = Or(Nor(A, Not(B)), Xor(B, C));
    break;
  case 0x6f:
    if (ABCIsConst)
      Res = Nand(A, Xnor(B, C));
    break;
  case 0x70:
    if (ABCIsConst)
      Res = And(A, Nand(B, C));
    break;
  case 0x71:
    if (ABCIsConst)
      Res = Xor(Nor(Xor(A, B), Xor(A, C)), A);
    break;
  case 0x72:
    if (ABCIsConst)
      Res = Xor(Or(Xor(A, B), C), B);
    break;
  case 0x73:
    if (ABCIsConst)
      Res = Nand(Nand(A, Not(C)), B);
    break;
  case 0x74:
    if (ABCIsConst)
      Res = Xor(Or(Xor(A, C), B), C);
    break;
  case 0x75:
    if (ABCIsConst)
      Res = Nand(Nand(A, Not(B)), C);
    break;
  case 0x76:
    if (ABCIsConst)
      Res = Xor(B, Or(Nor(B, Not(A)), C));
    break;
  case 0x77:
    if (BCIsConst)
      Res = Nand(B, C);
    break;
  case 0x78:
    if (ABCIsConst)
      Res = Xor(A, And(B, C));
    break;
  case 0x79:
    if (ABCIsConst)
      Res = Xor(Or(Xnor(A, B), Nor(B, C)), C);
    break;
  case 0x7a:
    if (ABCIsConst)
      Res = Or(Xor(A, C), Nor(B, Not(A)));
    break;
  case 0x7b:
    if (ABCIsConst)
      Res = Nand(Xnor(A, C), B);
    break;
  case 0x7c:
    if (ABCIsConst)
      Res = Or(Xor(A, B), Nor(C, Not(A)));
    break;
  case 0x7d:
    if (ABCIsConst)
      Res = Nand(Xnor(A, B), C);
    break;
  case 0x7e:
    if (ABCIsConst)
      Res = Or(Xor(A, B), Xor(A, C));
    break;
  case 0x7f:
    if (ABCIsConst)
      Res = Nand(And(A, B), C);
    break;
  case 0x80:
    if (ABCIsConst)
      Res = And(And(A, B), C);
    break;
  case 0x81:
    if (ABCIsConst)
      Res = Nor(Xor(A, B), Xor(A, C));
    break;
  case 0x82:
    if (ABCIsConst)
      Res = And(Xnor(A, B), C);
    break;
  case 0x83:
    if (ABCIsConst)
      Res = Nor(Xor(A, B), Nor(C, Not(A)));
    break;
  case 0x84:
    if (ABCIsConst)
      Res = And(Xnor(A, C), B);
    break;
  case 0x85:
    if (ABCIsConst)
      Res = Nor(Xor(A, C), Nor(B, Not(A)));
    break;
  case 0x86:
    if (ABCIsConst)
      Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C);
    break;
  case 0x87:
    if (ABCIsConst)
      Res = Xor(A, Nand(B, C));
    break;
  case 0x88:
    Res = And(B, C);
    break;
  case 0x89:
    if (ABCIsConst)
      Res = Xor(B, Nor(Nor(B, Not(A)), C));
    break;
  case 0x8a:
    if (ABCIsConst)
      Res = And(Nand(A, Not(B)), C);
    break;
  case 0x8b:
    if (ABCIsConst)
      Res = Xor(Nor(Xor(A, C), B), C);
    break;
  case 0x8c:
    if (ABCIsConst)
      Res = And(Nand(A, Not(C)), B);
    break;
  case 0x8d:
    if (ABCIsConst)
      Res = Xor(Nor(Xor(A, B), C), B);
    break;
  case 0x8e:
    if (ABCIsConst)
      Res = Xor(Or(Xor(A, B), Xor(A, C)), A);
    break;
  case 0x8f:
    if (ABCIsConst)
      Res = Nand(A, Nand(B, C));
    break;
  case 0x90:
    if (ABCIsConst)
      Res = And(A, Xnor(B, C));
    break;
  case 0x91:
    if (ABCIsConst)
      Res = Nor(Nor(A, Not(B)), Xor(B, C));
    break;
  case 0x92:
    if (ABCIsConst)
      Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C);
    break;
  case 0x93:
    if (ABCIsConst)
      Res = Xor(Nand(A, C), B);
    break;
  case 0x94:
    if (ABCIsConst)
      Res = Nor(Nor(A, B), Xor(Xnor(A, B), C));
    break;
  case 0x95:
    if (ABCIsConst)
      Res = Xor(Nand(A, B), C);
    break;
  case 0x96:
    if (ABCIsConst)
      Res = Xor(Xor(A, B), C);
    break;
  case 0x97:
    if (ABCIsConst)
      Res = Xor(Xor(A, B), Or(Nor(A, B), C));
    break;
  case 0x98:
    if (ABCIsConst)
      Res = Nor(Nor(A, B), Xor(B, C));
    break;
  case 0x99:
    if (BCIsConst)
      Res = Xnor(B, C);
    break;
  case 0x9a:
    if (ABCIsConst)
      Res = Xor(Nor(B, Not(A)), C);
    break;
  case 0x9b:
    if (ABCIsConst)
      Res = Or(Nor(A, B), Xnor(B, C));
    break;
  case 0x9c:
    if (ABCIsConst)
      Res = Xor(B, Nor(C, Not(A)));
    break;
  case 0x9d:
    if (ABCIsConst)
      Res = Or(Nor(A, C), Xnor(B, C));
    break;
  case 0x9e:
    if (ABCIsConst)
      Res = Xor(And(Xor(A, B), Nand(B, C)), C);
    break;
  case 0x9f:
    if (ABCIsConst)
      Res = Nand(A, Xor(B, C));
    break;
  case 0xa0:
    Res = And(A, C);
    break;
  case 0xa1:
    if (ABCIsConst)
      Res = Xor(A, Nor(Nor(A, Not(B)), C));
    break;
  case 0xa2:
    if (ABCIsConst)
      Res = And(Or(A, Not(B)), C);
    break;
  case 0xa3:
    if (ABCIsConst)
      Res = Xor(Nor(Xor(B, C), A), C);
    break;
  case 0xa4:
    if (ABCIsConst)
      Res = Xor(A, Nor(Nor(A, B), C));
    break;
  case 0xa5:
    if (ACIsConst)
      Res = Xnor(A, C);
    break;
  case 0xa6:
    if (ABCIsConst)
      Res = Xor(Nor(A, Not(B)), C);
    break;
  case 0xa7:
    if (ABCIsConst)
      Res = Or(Nor(A, B), Xnor(A, C));
    break;
  case 0xa8:
    if (ABCIsConst)
      Res = And(Or(A, B), C);
    break;
  case 0xa9:
    if (ABCIsConst)
      Res = Xor(Nor(A, B), C);
    break;
  case 0xaa:
    Res = C;
    break;
  case 0xab:
    if (ABCIsConst)
      Res = Or(Nor(A, B), C);
    break;
  case 0xac:
    if (ABCIsConst)
      Res = Xor(Nor(Xnor(B, C), A), C);
    break;
  case 0xad:
    if (ABCIsConst)
      Res = Or(Xnor(A, C), And(B, C));
    break;
  case 0xae:
    if (ABCIsConst)
      Res = Or(Nor(A, Not(B)), C);
    break;
  case 0xaf:
    if (ACIsConst)
      Res = Or(C, Not(A));
    break;
  case 0xb0:
    if (ABCIsConst)
      Res = And(A, Nand(B, Not(C)));
    break;
  case 0xb1:
    if (ABCIsConst)
      Res = Xor(A, Nor(Xor(A, B), C));
    break;
  case 0xb2:
    if (ABCIsConst)
      Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A);
    break;
  case 0xb3:
    if (ABCIsConst)
      Res = Nand(Nand(A, C), B);
    break;
  case 0xb4:
    if (ABCIsConst)
      Res = Xor(A, Nor(C, Not(B)));
    break;
  case 0xb5:
    if (ABCIsConst)
      Res = Or(Xnor(A, C), Nor(B, C));
    break;
  case 0xb6:
    if (ABCIsConst)
      Res = Xor(And(Xor(A, B), Nand(A, C)), C);
    break;
  case 0xb7:
    if (ABCIsConst)
      Res = Nand(Xor(A, C), B);
    break;
  case 0xb8:
    if (ABCIsConst)
      Res = Xor(Nor(Xnor(A, C), B), C);
    break;
  case 0xb9:
    if (ABCIsConst)
      Res = Xor(Nor(And(A, C), B), C);
    break;
  case 0xba:
    if (ABCIsConst)
      Res = Or(Nor(B, Not(A)), C);
    break;
  case 0xbb:
    if (BCIsConst)
      Res = Or(C, Not(B));
    break;
  case 0xbc:
    if (ABCIsConst)
      Res = Xor(A, And(Nand(A, C), B));
    break;
  case 0xbd:
    if (ABCIsConst)
      Res = Or(Xor(A, B), Xnor(A, C));
    break;
  case 0xbe:
    if (ABCIsConst)
      Res = Or(Xor(A, B), C);
    break;
  case 0xbf:
    if (ABCIsConst)
      Res = Or(Nand(A, B), C);
    break;
  case 0xc0:
    Res = And(A, B);
    break;
  case 0xc1:
    if (ABCIsConst)
      Res = Xor(A, Nor(Nor(A, Not(C)), B));
    break;
  case 0xc2:
    if (ABCIsConst)
      Res = Xor(A, Nor(Nor(A, C), B));
    break;
  case 0xc3:
    if (ABIsConst)
      Res = Xnor(A, B);
    break;
  case 0xc4:
    if (ABCIsConst)
      Res = And(Or(A, Not(C)), B);
    break;
  case 0xc5:
    if (ABCIsConst)
      Res = Xor(B, Nor(A, Xor(B, C)));
    break;
  case 0xc6:
    if (ABCIsConst)
      Res = Xor(Nor(A, Not(C)), B);
    break;
  case 0xc7:
    if (ABCIsConst)
      Res = Or(Xnor(A, B), Nor(A, C));
    break;
  case 0xc8:
    if (ABCIsConst)
      Res = And(Or(A, C), B);
    break;
  case 0xc9:
    if (ABCIsConst)
      Res = Xor(Nor(A, C), B);
    break;
  case 0xca:
    if (ABCIsConst)
      Res = Xor(B, Nor(A, Xnor(B, C)));
    break;
  case 0xcb:
    if (ABCIsConst)
      Res = Or(Xnor(A, B), And(B, C));
    break;
  case 0xcc:
    Res = B;
    break;
  case 0xcd:
    if (ABCIsConst)
      Res = Or(Nor(A, C), B);
    break;
  case 0xce:
    if (ABCIsConst)
      Res = Or(Nor(A, Not(C)), B);
    break;
  case 0xcf:
    if (ABIsConst)
      Res = Or(B, Not(A));
    break;
  case 0xd0:
    if (ABCIsConst)
      Res = And(A, Or(B, Not(C)));
    break;
  case 0xd1:
    if (ABCIsConst)
      Res = Xor(A, Nor(Xor(A, C), B));
    break;
  case 0xd2:
    if (ABCIsConst)
      Res = Xor(A, Nor(B, Not(C)));
    break;
  case 0xd3:
    if (ABCIsConst)
      Res = Or(Xnor(A, B), Nor(B, C));
    break;
  case 0xd4:
    if (ABCIsConst)
      Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A);
    break;
  case 0xd5:
    if (ABCIsConst)
      Res = Nand(Nand(A, B), C);
    break;
  case 0xd6:
    if (ABCIsConst)
      Res = Xor(Xor(A, B), Or(And(A, B), C));
    break;
  case 0xd7:
    if (ABCIsConst)
      Res = Nand(Xor(A, B), C);
    break;
  case 0xd8:
    if (ABCIsConst)
      Res = Xor(Nor(Xnor(A, B), C), B);
    break;
  case 0xd9:
    if (ABCIsConst)
      Res = Or(And(A, B), Xnor(B, C));
    break;
  case 0xda:
    if (ABCIsConst)
      Res = Xor(A, And(Nand(A, B), C));
    break;
  case 0xdb:
    if (ABCIsConst)
      Res = Or(Xnor(A, B), Xor(A, C));
    break;
  case 0xdc:
    if (ABCIsConst)
      Res = Or(B, Nor(C, Not(A)));
    break;
  case 0xdd:
    if (BCIsConst)
      Res = Or(B, Not(C));
    break;
  case 0xde:
    if (ABCIsConst)
      Res = Or(Xor(A, C), B);
    break;
  case 0xdf:
    if (ABCIsConst)
      Res = Or(Nand(A, C), B);
    break;
  case 0xe0:
    if (ABCIsConst)
      Res = And(A, Or(B, C));
    break;
  case 0xe1:
    if (ABCIsConst)
      Res = Xor(A, Nor(B, C));
    break;
  case 0xe2:
    if (ABCIsConst)
      Res = Xor(A, Nor(Xnor(A, C), B));
    break;
  case 0xe3:
    if (ABCIsConst)
      Res = Xor(A, Nor(And(A, C), B));
    break;
  case 0xe4:
    if (ABCIsConst)
      Res = Xor(A, Nor(Xnor(A, B), C));
    break;
  case 0xe5:
    if (ABCIsConst)
      Res = Xor(A, Nor(And(A, B), C));
    break;
  case 0xe6:
    if (ABCIsConst)
      Res = Or(And(A, B), Xor(B, C));
    break;
  case 0xe7:
    if (ABCIsConst)
      Res = Or(Xnor(A, B), Xnor(A, C));
    break;
  case 0xe8:
    if (ABCIsConst)
      Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
    break;
  case 0xe9:
    if (ABCIsConst)
      Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
    break;
  case 0xea:
    if (ABCIsConst)
      Res = Or(And(A, B), C);
    break;
  case 0xeb:
    if (ABCIsConst)
      Res = Or(Xnor(A, B), C);
    break;
  case 0xec:
    if (ABCIsConst)
      Res = Or(And(A, C), B);
    break;
  case 0xed:
    if (ABCIsConst)
      Res = Or(Xnor(A, C), B);
    break;
  case 0xee:
    Res = Or(B, C);
    break;
  case 0xef:
    if (ABCIsConst)
      Res = Nand(A, Nor(B, C));
    break;
  case 0xf0:
    Res = A;
    break;
  case 0xf1:
    if (ABCIsConst)
      Res = Or(A, Nor(B, C));
    break;
  case 0xf2:
    if (ABCIsConst)
      Res = Or(A, Nor(B, Not(C)));
    break;
  case 0xf3:
    if (ABIsConst)
      Res = Or(A, Not(B));
    break;
  case 0xf4:
    if (ABCIsConst)
      Res = Or(A, Nor(C, Not(B)));
    break;
  case 0xf5:
    if (ACIsConst)
      Res = Or(A, Not(C));
    break;
  case 0xf6:
    if (ABCIsConst)
      Res = Or(A, Xor(B, C));
    break;
  case 0xf7:
    if (ABCIsConst)
      Res = Or(A, Nand(B, C));
    break;
  case 0xf8:
    if (ABCIsConst)
      Res = Or(A, And(B, C));
    break;
  case 0xf9:
    if (ABCIsConst)
      Res = Or(A, Xnor(B, C));
    break;
  case 0xfa:
    Res = Or(A, C);
    break;
  case 0xfb:
    if (ABCIsConst)
      Res = Nand(Nor(A, C), B);
    break;
  case 0xfc:
    Res = Or(A, B);
    break;
  case 0xfd:
    if (ABCIsConst)
      Res = Nand(Nor(A, B), C);
    break;
  case 0xfe:
    if (ABCIsConst)
      Res = Or(Or(A, B), C);
    break;
  case 0xff:
    Res = {Constant::getAllOnesValue(Ty), 0xff};
    break;
  }

  assert((Res.first == nullptr || Res.second == Imm) &&
         "Simplification of ternary logic does not verify!");
  return Res.first;
}

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  // [3:0] - zero mask for each 32-bit lane
  // [5:4] - select one 32-bit destination lane
  // [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *C1 = dyn_cast<Constant>(Op1);
  auto *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  auto *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[16];

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = Index.getZExtValue();
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
}

/// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
static Value *simplifyX86vpermv3(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 ||
          Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= (2 * Size) - 1;
    Indexes[I] = Index;
  }

  auto V1 = II.getArgOperand(0);
  auto V2 = II.getArgOperand(2);
  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size));
}

std::optional<Instruction *>
X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                             unsigned DemandedWidth) {
    APInt UndefElts(Width, 0);
    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
  };

  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::x86_bmi_bextr_32:
  case Intrinsic::x86_bmi_bextr_64:
  case Intrinsic::x86_tbm_bextri_u32:
  case Intrinsic::x86_tbm_bextri_u64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Shift = C->getZExtValue();
      uint64_t Length = (Shift >> 8) & 0xff;
      Shift &= 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      // If the length is 0 or the shift is out of range, replace with zero.
      if (Length == 0 || Shift >= BitWidth) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue() >> Shift;
        if (Length > BitWidth)
          Length = BitWidth;
        Result &= maskTrailingOnes<uint64_t>(Length);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
      // are only masking bits that a shift already cleared?
    }
    break;

  case Intrinsic::x86_bmi_bzhi_32:
  case Intrinsic::x86_bmi_bzhi_64:
    // If the RHS is a constant we can try some simplifications.
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      uint64_t Index = C->getZExtValue() & 0xff;
      unsigned BitWidth = II.getType()->getIntegerBitWidth();
      if (Index >= BitWidth) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }
      if (Index == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      // If the LHS is also a constant, we can completely constant fold this.
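      // Illustrative example: bzhi(0x000000FF, 4) keeps only the low 4 bits
      // and therefore folds to 0x0000000F via the mask computed below.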
      if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Result = InC->getZExtValue();
        Result &= maskTrailingOnes<uint64_t>(Index);
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
      // TODO should we convert this to an AND if the RHS is constant?
    }
    break;
  case Intrinsic::x86_bmi_pext_32:
  case Intrinsic::x86_bmi_pext_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
        return IC.replaceInstUsesWith(II, Shifted);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToSet = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToTest = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToSet <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;
  case Intrinsic::x86_bmi_pdep_32:
  case Intrinsic::x86_bmi_pdep_64:
    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
      if (MaskC->isNullValue()) {
        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
      }
      if (MaskC->isAllOnesValue()) {
        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
      }

      unsigned MaskIdx, MaskLen;
      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
        // Any single contiguous sequence of 1s anywhere in the mask simply
        // describes a subset of the input bits shifted to the appropriate
        // position. Replace with the straightforward IR.
        Value *Input = II.getArgOperand(0);
        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
        return IC.replaceInstUsesWith(II, Masked);
      }

      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
        uint64_t Src = SrcC->getZExtValue();
        uint64_t Mask = MaskC->getZExtValue();
        uint64_t Result = 0;
        uint64_t BitToTest = 1;

        while (Mask) {
          // Isolate lowest set bit.
          uint64_t BitToSet = Mask & -Mask;
          if (BitToTest & Src)
            Result |= BitToSet;

          BitToTest <<= 1;
          // Clear lowest set bit.
          Mask &= Mask - 1;
        }

        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), Result));
      }
    }
    break;

  case Intrinsic::x86_sse_cvtss2si:
  case Intrinsic::x86_sse_cvtss2si64:
  case Intrinsic::x86_sse_cvttss2si:
  case Intrinsic::x86_sse_cvttss2si64:
  case Intrinsic::x86_sse2_cvtsd2si:
  case Intrinsic::x86_sse2_cvtsd2si64:
  case Intrinsic::x86_sse2_cvttsd2si:
  case Intrinsic::x86_sse2_cvttsd2si64:
  case Intrinsic::x86_avx512_vcvtss2si32:
  case Intrinsic::x86_avx512_vcvtss2si64:
  case Intrinsic::x86_avx512_vcvtss2usi32:
  case Intrinsic::x86_avx512_vcvtss2usi64:
  case Intrinsic::x86_avx512_vcvtsd2si32:
  case Intrinsic::x86_avx512_vcvtsd2si64:
  case Intrinsic::x86_avx512_vcvtsd2usi32:
  case Intrinsic::x86_avx512_vcvtsd2usi64:
  case Intrinsic::x86_avx512_cvttss2si:
  case Intrinsic::x86_avx512_cvttss2si64:
  case Intrinsic::x86_avx512_cvttss2usi:
  case Intrinsic::x86_avx512_cvttss2usi64:
  case Intrinsic::x86_avx512_cvttsd2si:
  case Intrinsic::x86_avx512_cvttsd2si64:
  case Intrinsic::x86_avx512_cvttsd2usi:
  case Intrinsic::x86_avx512_cvttsd2usi64: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
    Value *Arg = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx2_pmovmskb:
    if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomineq_sd:
  case Intrinsic::x86_avx512_vcomi_ss:
  case Intrinsic::x86_avx512_vcomi_sd:
  case Intrinsic::x86_avx512_mask_cmp_ss:
  case Intrinsic::x86_avx512_mask_cmp_sd: {
    // These intrinsics only demand the 0th element of their input vectors. If
    // we can simplify the input based on that, do so now.
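    // (The comparison reads only lane 0 of each source, e.g. comieq.ss
    // compares Arg0[0] with Arg1[0], so the upper lanes of both operands can
    // be simplified away.)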
    bool MadeChange = false;
    Value *Arg0 = II.getArgOperand(0);
    Value *Arg1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_avx512_add_ps_512:
  case Intrinsic::x86_avx512_div_ps_512:
  case Intrinsic::x86_avx512_mul_ps_512:
  case Intrinsic::x86_avx512_sub_ps_512:
  case Intrinsic::x86_avx512_add_pd_512:
  case Intrinsic::x86_avx512_div_pd_512:
  case Intrinsic::x86_avx512_mul_pd_512:
  case Intrinsic::x86_avx512_sub_pd_512:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      if (R->getValue() == 4) {
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_add_ps_512:
        case Intrinsic::x86_avx512_add_pd_512:
          V = IC.Builder.CreateFAdd(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_sub_ps_512:
        case Intrinsic::x86_avx512_sub_pd_512:
          V = IC.Builder.CreateFSub(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_mul_ps_512:
        case Intrinsic::x86_avx512_mul_pd_512:
          V = IC.Builder.CreateFMul(Arg0, Arg1);
          break;
        case Intrinsic::x86_avx512_div_ps_512:
        case Intrinsic::x86_avx512_div_pd_512:
          V = IC.Builder.CreateFDiv(Arg0, Arg1);
          break;
        }

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
    // IR operations.
    if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
      if (R->getValue() == 4) {
        // Extract the element as scalars.
        Value *Arg0 = II.getArgOperand(0);
        Value *Arg1 = II.getArgOperand(1);
        Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
        Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

        Value *V;
        switch (IID) {
        default:
          llvm_unreachable("Case stmts out of sync!");
        case Intrinsic::x86_avx512_mask_add_ss_round:
        case Intrinsic::x86_avx512_mask_add_sd_round:
          V = IC.Builder.CreateFAdd(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_sub_ss_round:
        case Intrinsic::x86_avx512_mask_sub_sd_round:
          V = IC.Builder.CreateFSub(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_mul_ss_round:
        case Intrinsic::x86_avx512_mask_mul_sd_round:
          V = IC.Builder.CreateFMul(LHS, RHS);
          break;
        case Intrinsic::x86_avx512_mask_div_ss_round:
        case Intrinsic::x86_avx512_mask_div_sd_round:
          V = IC.Builder.CreateFDiv(LHS, RHS);
          break;
        }

        // Handle the masking aspect of the intrinsic.
        Value *Mask = II.getArgOperand(3);
        auto *C = dyn_cast<ConstantInt>(Mask);
        // We don't need a select if we know the mask bit is a 1.
        if (!C || !C->getValue()[0]) {
          // Cast the mask to an i1 vector and then extract the lowest element.
          auto *MaskTy = FixedVectorType::get(
              IC.Builder.getInt1Ty(),
              cast<IntegerType>(Mask->getType())->getBitWidth());
          Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
          Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
          // Extract the lowest element from the passthru operand.
          Value *Passthru =
              IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
          V = IC.Builder.CreateSelect(Mask, V, Passthru);
        }

        // Insert the result back into the original argument 0.
        V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

        return IC.replaceInstUsesWith(II, V);
      }
    }
    break;

  // Constant fold ashr( <A x Bi>, Ci ).
  // Constant fold lshr( <A x Bi>, Ci ).
  // Constant fold shl( <A x Bi>, Ci ).
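  // For example (illustrative): @llvm.x86.sse2.psrai.w(X, 3) with a constant
  // shift amount can be rewritten as a generic 'ashr' of X by a splat of 3,
  // which the target-independent folds understand.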
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512: {
    if (Value *V = simplifyX86immShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
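    // For instance, for psra.w the shift count comes from the low 64 bits of
    // the second operand (its low VWidth / 2 elements), which is why only the
    // low half is marked as demanded below.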
    Value *Arg1 = II.getArgOperand(1);
    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
           "Unexpected packed shift size");
    unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
      return IC.replaceOperand(II, 1, V);
    }
    break;
  }

  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    if (Value *V = simplifyX86varShift(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512:
    if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_pmulh_w:
  case Intrinsic::x86_avx2_pmulh_w:
  case Intrinsic::x86_avx512_pmulh_w_512:
    if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_pmulhu_w:
  case Intrinsic::x86_avx2_pmulhu_w:
  case Intrinsic::x86_avx512_pmulhu_w_512:
    if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_ssse3_pmul_hr_sw_128:
  case Intrinsic::x86_avx2_pmul_hr_sw:
  case Intrinsic::x86_avx512_pmul_hr_sw_512:
    if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse2_pmadd_wd:
  case Intrinsic::x86_avx2_pmadd_wd:
  case Intrinsic::x86_avx512_pmaddw_d_512:
    if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
  case Intrinsic::x86_avx2_pmadd_ub_sw:
  case Intrinsic::x86_avx512_pmaddubs_w_512:
    if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_pclmulqdq:
  case Intrinsic::x86_pclmulqdq_256:
  case Intrinsic::x86_pclmulqdq_512: {
    if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
      unsigned Imm = C->getZExtValue();

      bool MadeChange = false;
      Value *Arg0 = II.getArgOperand(0);
      Value *Arg1 = II.getArgOperand(1);
      unsigned VWidth =
          cast<FixedVectorType>(Arg0->getType())->getNumElements();

      APInt UndefElts1(VWidth, 0);
      APInt DemandedElts1 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
        IC.replaceOperand(II, 0, V);
        MadeChange = true;
      }

      APInt UndefElts2(VWidth, 0);
      APInt DemandedElts2 =
          APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
      if (Value *V =
              IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
        IC.replaceOperand(II, 1, V);
        MadeChange = true;
      }

      // If all demanded elements of either input are undef, the result is
      // zero.
      if (DemandedElts1.isSubsetOf(UndefElts1) ||
          DemandedElts2.isSubsetOf(UndefElts2)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantAggregateZero::get(II.getType()));
      }

      if (MadeChange) {
        return &II;
      }
    }
    break;
  }

  case Intrinsic::x86_sse41_insertps:
    if (Value *V = simplifyX86insertps(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_sse4a_extrq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 16 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CILength =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
           : nullptr;
    auto *CIIndex =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
    // operand and the lowest 16-bits of the second.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse4a_extrqi: {
    // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
    // bits of the lower 64-bits. The upper 64-bits are undefined.
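    // For example (illustrative): extrqi(X, 8, 16) yields (X[0] >> 16) & 0xFF
    // in the low 64 bits of the result, with the upper 64 bits undefined.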
    Value *Op0 = II.getArgOperand(0);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));

    // Attempt to simplify to a constant or shuffle vector.
    if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertq: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
           cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
           "Unexpected operand size");

    // See if we're dealing with constant values.
    auto *C1 = dyn_cast<Constant>(Op1);
    auto *CI11 =
        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
           : nullptr;

    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
    if (CI11) {
      const APInt &V11 = CI11->getValue();
      APInt Len = V11.zextOrTrunc(6);
      APInt Idx = V11.lshr(8).zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
    // operand.
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
      return IC.replaceOperand(II, 0, V);
    }
    break;
  }

  case Intrinsic::x86_sse4a_insertqi: {
    // INSERTQI: Extract lowest Length bits from lower half of second source
    // and insert over first source starting at Index bit. The upper 64-bits
    // are undefined.
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
    unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
           VWidth1 == 2 && "Unexpected operand sizes");

    // See if we're dealing with constant values.
    auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
    auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));

    // Attempt to simplify to a constant or shuffle vector.
    if (CILength && CIIndex) {
      APInt Len = CILength->getValue().zextOrTrunc(6);
      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
      if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
        return IC.replaceInstUsesWith(II, V);
      }
    }

    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
    // operands.
    bool MadeChange = false;
    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
      IC.replaceOperand(II, 0, V);
      MadeChange = true;
    }
    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
      IC.replaceOperand(II, 1, V);
      MadeChange = true;
    }
    if (MadeChange) {
      return &II;
    }
    break;
  }

  case Intrinsic::x86_sse41_pblendvb:
  case Intrinsic::x86_sse41_blendvps:
  case Intrinsic::x86_sse41_blendvpd:
  case Intrinsic::x86_avx_blendv_ps_256:
  case Intrinsic::x86_avx_blendv_pd_256:
  case Intrinsic::x86_avx2_pblendvb: {
    // fold (blend A, A, Mask) -> A
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Mask = II.getArgOperand(2);
    if (Op0 == Op1) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Zero Mask - select 1st argument.
    if (isa<ConstantAggregateZero>(Mask)) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
      Constant *NewSelector =
          getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
    }

    Mask = InstCombiner::peekThroughBitcast(Mask);

    // Peek through a one-use shuffle - VectorCombine should have simplified
    // this for cases where we're splitting wider vectors to use blendv
    // intrinsics.
    Value *MaskSrc = nullptr;
    ArrayRef<int> ShuffleMask;
    if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),
                                       m_Mask(ShuffleMask))))) {
      // Bail if the shuffle was irregular or contains undefs.
      int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
      if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
          any_of(ShuffleMask,
                 [NumElts](int M) { return M < 0 || M >= NumElts; }))
        break;
      Mask = InstCombiner::peekThroughBitcast(MaskSrc);
    }

    // Convert to a vector select if we can bypass casts and find a boolean
    // vector condition value.
    Value *BoolVec;
    if (match(Mask, m_SExt(m_Value(BoolVec))) &&
        BoolVec->getType()->isVectorTy() &&
        BoolVec->getType()->getScalarSizeInBits() == 1) {
      auto *MaskTy = cast<FixedVectorType>(Mask->getType());
      auto *OpTy = cast<FixedVectorType>(II.getType());
      unsigned NumMaskElts = MaskTy->getNumElements();
      unsigned NumOperandElts = OpTy->getNumElements();

      // If we peeked through a shuffle, reapply the shuffle to the bool vector.
      if (MaskSrc) {
        unsigned NumMaskSrcElts =
            cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
        NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
        // Multiple mask bits map to the same operand element - bail out.
        if (NumMaskElts > NumOperandElts)
          break;
        SmallVector<int> ScaledMask;
        if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
          break;
        BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
        MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
      }
      assert(MaskTy->getPrimitiveSizeInBits() ==
                 OpTy->getPrimitiveSizeInBits() &&
             "Not expecting mask and operands with different sizes");

      if (NumMaskElts == NumOperandElts) {
        return SelectInst::Create(BoolVec, Op1, Op0);
      }

      // If the mask has fewer elements than the operands, each mask bit maps
      // to multiple elements of the operands. Bitcast back and forth.
      if (NumMaskElts < NumOperandElts) {
        Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy);
        Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy);
        Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
        return new BitCastInst(Sel, II.getType());
      }
    }

    break;
  }

  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512: {
    if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }

    KnownBits KnownMask(8);
    if (IC.SimplifyDemandedBits(&II, 1, APInt(8, 0b10001111), KnownMask))
      return &II;
    break;
  }

  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
    if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
  case Intrinsic::x86_avx512_permvar_df_256:
  case Intrinsic::x86_avx512_permvar_df_512:
  case Intrinsic::x86_avx512_permvar_di_256:
  case Intrinsic::x86_avx512_permvar_di_512:
  case Intrinsic::x86_avx512_permvar_hi_128:
  case Intrinsic::x86_avx512_permvar_hi_256:
  case Intrinsic::x86_avx512_permvar_hi_512:
  case Intrinsic::x86_avx512_permvar_qi_128:
  case Intrinsic::x86_avx512_permvar_qi_256:
  case Intrinsic::x86_avx512_permvar_qi_512:
  case Intrinsic::x86_avx512_permvar_sf_512:
  case Intrinsic::x86_avx512_permvar_si_512:
    if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx512_vpermi2var_d_128:
  case Intrinsic::x86_avx512_vpermi2var_d_256:
  case Intrinsic::x86_avx512_vpermi2var_d_512:
  case Intrinsic::x86_avx512_vpermi2var_hi_128:
  case Intrinsic::x86_avx512_vpermi2var_hi_256:
  case Intrinsic::x86_avx512_vpermi2var_hi_512:
  case Intrinsic::x86_avx512_vpermi2var_pd_128:
  case Intrinsic::x86_avx512_vpermi2var_pd_256:
  case Intrinsic::x86_avx512_vpermi2var_pd_512:
  case Intrinsic::x86_avx512_vpermi2var_ps_128:
  case Intrinsic::x86_avx512_vpermi2var_ps_256:
  case Intrinsic::x86_avx512_vpermi2var_ps_512:
  case Intrinsic::x86_avx512_vpermi2var_q_128:
  case Intrinsic::x86_avx512_vpermi2var_q_256:
  case Intrinsic::x86_avx512_vpermi2var_q_512:
  case Intrinsic::x86_avx512_vpermi2var_qi_128:
  case Intrinsic::x86_avx512_vpermi2var_qi_256:
  case Intrinsic::x86_avx512_vpermi2var_qi_512:
    if (Value *V = simplifyX86vpermv3(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx_maskload_ps:
  case Intrinsic::x86_avx_maskload_pd:
  case Intrinsic::x86_avx_maskload_ps_256:
  case Intrinsic::x86_avx_maskload_pd_256:
  case Intrinsic::x86_avx2_maskload_d:
  case Intrinsic::x86_avx2_maskload_q:
  case Intrinsic::x86_avx2_maskload_d_256:
  case Intrinsic::x86_avx2_maskload_q_256:
    if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
      return I;
    }
    break;

  case Intrinsic::x86_sse2_maskmov_dqu:
  case Intrinsic::x86_avx_maskstore_ps:
  case Intrinsic::x86_avx_maskstore_pd:
  case Intrinsic::x86_avx_maskstore_ps_256:
  case Intrinsic::x86_avx_maskstore_pd_256:
  case Intrinsic::x86_avx2_maskstore_d:
  case Intrinsic::x86_avx2_maskstore_q:
  case Intrinsic::x86_avx2_maskstore_d_256:
  case Intrinsic::x86_avx2_maskstore_q_256:
    if (simplifyX86MaskedStore(II, IC)) {
      return nullptr;
    }
    break;

  case Intrinsic::x86_addcarry_32:
  case Intrinsic::x86_addcarry_64:
    if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;

  case Intrinsic::x86_avx512_pternlog_d_128:
  case Intrinsic::x86_avx512_pternlog_d_256:
  case Intrinsic::x86_avx512_pternlog_d_512:
  case Intrinsic::x86_avx512_pternlog_q_128:
  case Intrinsic::x86_avx512_pternlog_q_256:
  case Intrinsic::x86_avx512_pternlog_q_512:
    if (Value *V = simplifyTernarylogic(II, IC.Builder)) {
      return IC.replaceInstUsesWith(II, V);
    }
    break;
  default:
    break;
  }
  return std::nullopt;
}

std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
    bool &KnownBitsComputed) const {
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_mmx_pmovmskb:
  case Intrinsic::x86_sse_movmsk_ps:
  case Intrinsic::x86_sse2_movmsk_pd:
  case Intrinsic::x86_sse2_pmovmskb_128:
  case Intrinsic::x86_avx_movmsk_ps_256:
  case Intrinsic::x86_avx_movmsk_pd_256:
  case Intrinsic::x86_avx2_pmovmskb: {
    // MOVMSK copies the vector elements' sign bits to the low bits
    // and zeros the high bits.
    unsigned ArgWidth;
    if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
      ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
    } else {
      auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType());
      ArgWidth = ArgType->getNumElements();
    }

    // If we don't need any of the low bits then return zero,
    // we know that DemandedMask is non-zero already.
    APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
    Type *VTy = II.getType();
    if (DemandedElts.isZero()) {
      return ConstantInt::getNullValue(VTy);
    }

    // We know that the upper bits are set to zero.
    Known.Zero.setBitsFrom(ArgWidth);
    KnownBitsComputed = true;
    break;
  }
  }
  return std::nullopt;
}

std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        simplifyAndSetOp) const {
  unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
  switch (II.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::x86_xop_vfrcz_ss:
  case Intrinsic::x86_xop_vfrcz_sd:
    // The instructions for these intrinsics are specified to zero the upper
    // bits, not pass them through like other scalar intrinsics. So we
    // shouldn't just use Arg0 if DemandedElts[0] is clear like we do for
    // other intrinsics. Instead we should return a zero vector.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return ConstantAggregateZero::get(II.getType());
    }

    // Only the lower element is used.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // Only the lower element is undefined. The high elements are zero.
    UndefElts = UndefElts[0];
    break;

  // Unary scalar-as-vector operations that work column-wise.
  case Intrinsic::x86_sse_rcp_ss:
  case Intrinsic::x86_sse_rsqrt_ss:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }
    // TODO: If only the low element is used, lower SQRT to FSQRT (with
    // rounding/exception checks).
    break;

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0. The low element is a function of both
  // operands.
  case Intrinsic::x86_sse_min_ss:
  case Intrinsic::x86_sse_max_ss:
  case Intrinsic::x86_sse_cmp_ss:
  case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
  case Intrinsic::x86_sse2_cmp_sd: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // The lower element is undefined if both lower elements are undefined.
    // Consider things like undef & 0. The result is known zero, not undef.
    if (!UndefElts2[0])
      UndefElts.clearBit(0);

    break;
  }

  // Binary scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element comes from operand 1.
  case Intrinsic::x86_sse41_round_ss:
  case Intrinsic::x86_sse41_round_sd: {
    // Don't use the low element of operand 0.
    APInt DemandedElts2 = DemandedElts;
    DemandedElts2.clearBit(0);
    simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operand 1.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

    // Take the high undef elements from operand 0 and take the lower element
    // from operand 1.
    UndefElts.clearBit(0);
    UndefElts |= UndefElts2[0];
    break;
  }

  // Three input scalar-as-vector operations that work column-wise. The high
  // elements come from operand 0 and the low element is a function of all
  // three inputs.
  case Intrinsic::x86_avx512_mask_add_ss_round:
  case Intrinsic::x86_avx512_mask_div_ss_round:
  case Intrinsic::x86_avx512_mask_mul_ss_round:
  case Intrinsic::x86_avx512_mask_sub_ss_round:
  case Intrinsic::x86_avx512_mask_max_ss_round:
  case Intrinsic::x86_avx512_mask_min_ss_round:
  case Intrinsic::x86_avx512_mask_add_sd_round:
  case Intrinsic::x86_avx512_mask_div_sd_round:
  case Intrinsic::x86_avx512_mask_mul_sd_round:
  case Intrinsic::x86_avx512_mask_sub_sd_round:
  case Intrinsic::x86_avx512_mask_max_sd_round:
  case Intrinsic::x86_avx512_mask_min_sd_round:
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

    // If the lowest element of a scalar op isn't used then use Arg0.
    if (!DemandedElts[0]) {
      IC.addToWorklist(&II);
      return II.getArgOperand(0);
    }

    // Only the lower element is used for operands 1 and 2.
    DemandedElts = 1;
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

    // The lower element is undefined if all three lower elements are
    // undefined. Consider things like undef & 0. The result is known zero,
    // not undef.
    if (!UndefElts2[0] || !UndefElts3[0])
      UndefElts.clearBit(0);
    break;

  // TODO: Add fmaddsub support?
  case Intrinsic::x86_sse3_addsub_pd:
  case Intrinsic::x86_sse3_addsub_ps:
  case Intrinsic::x86_avx_addsub_pd_256:
  case Intrinsic::x86_avx_addsub_ps_256: {
    // If none of the even or none of the odd lanes are required, turn this
    // into a generic FP math instruction.
    APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
    APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
    bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
    bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
    if (IsSubOnly || IsAddOnly) {
      assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
      IRBuilderBase::InsertPointGuard Guard(IC.Builder);
      IC.Builder.SetInsertPoint(&II);
      Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
      return IC.Builder.CreateBinOp(
          IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
    }

    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  // General per-element vector operations.
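  // For these, result element i depends only on element i of each input (e.g.
  // for psllv, lane i of the result is Op0[i] << Op1[i]), so the demanded
  // mask is forwarded to both operands and a result lane is reported undef
  // only when the corresponding lane of both inputs is undef.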
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    UndefElts &= UndefElts2;
    break;
  }

  case Intrinsic::x86_sse2_pmulh_w:
  case Intrinsic::x86_avx2_pmulh_w:
  case Intrinsic::x86_avx512_pmulh_w_512:
  case Intrinsic::x86_sse2_pmulhu_w:
  case Intrinsic::x86_avx2_pmulhu_w:
  case Intrinsic::x86_avx512_pmulhu_w_512:
  case Intrinsic::x86_ssse3_pmul_hr_sw_128:
  case Intrinsic::x86_avx2_pmul_hr_sw:
  case Intrinsic::x86_avx512_pmul_hr_sw_512: {
    simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
    // NOTE: mulh(undef,undef) != undef.
    break;
  }

  case Intrinsic::x86_sse2_packssdw_128:
  case Intrinsic::x86_sse2_packsswb_128:
  case Intrinsic::x86_sse2_packuswb_128:
  case Intrinsic::x86_sse41_packusdw:
  case Intrinsic::x86_avx2_packssdw:
  case Intrinsic::x86_avx2_packsswb:
  case Intrinsic::x86_avx2_packusdw:
  case Intrinsic::x86_avx2_packuswb:
  case Intrinsic::x86_avx512_packssdw_512:
  case Intrinsic::x86_avx512_packsswb_512:
  case Intrinsic::x86_avx512_packusdw_512:
  case Intrinsic::x86_avx512_packuswb_512: {
    auto *Ty0 = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
    assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");

    unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
    unsigned VWidthPerLane = VWidth / NumLanes;
    unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;

    // Per lane, pack the elements of the first input and then the second.
    // e.g.
    // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
    // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
    for (int OpNum = 0; OpNum != 2; ++OpNum) {
      APInt OpDemandedElts(InnerVWidth, 0);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        unsigned LaneIdx = Lane * VWidthPerLane;
        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
          unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
          if (DemandedElts[Idx])
            OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
        }
      }

      // Demand elements from the operand.
      APInt OpUndefElts(InnerVWidth, 0);
      simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);

      // Pack the operand's UNDEF elements, one lane at a time.
      OpUndefElts = OpUndefElts.zext(VWidth);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
        LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
        LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
        UndefElts |= LaneElts;
      }
    }
    break;
  }

  case Intrinsic::x86_sse2_pmadd_wd:
  case Intrinsic::x86_avx2_pmadd_wd:
  case Intrinsic::x86_avx512_pmaddw_d_512:
  case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
  case Intrinsic::x86_avx2_pmadd_ub_sw:
  case Intrinsic::x86_avx512_pmaddubs_w_512: {
    // PMADD - demand both src elements that map to each dst element.
    auto *ArgTy = II.getArgOperand(0)->getType();
    unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements();
    assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
    APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth);
    APInt Op0UndefElts(InnerVWidth, 0);
    APInt Op1UndefElts(InnerVWidth, 0);
    simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts);
    simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts);
    // NOTE: madd(undef,undef) != undef.
    break;
  }

  // PSHUFB
  case Intrinsic::x86_ssse3_pshuf_b_128:
  case Intrinsic::x86_avx2_pshuf_b:
  case Intrinsic::x86_avx512_pshuf_b_512:
  // PERMILVAR
  case Intrinsic::x86_avx_vpermilvar_ps:
  case Intrinsic::x86_avx_vpermilvar_ps_256:
  case Intrinsic::x86_avx512_vpermilvar_ps_512:
  case Intrinsic::x86_avx_vpermilvar_pd:
  case Intrinsic::x86_avx_vpermilvar_pd_256:
  case Intrinsic::x86_avx512_vpermilvar_pd_512:
  // PERMV
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps: {
    simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
    break;
  }

  // SSE4A instructions leave the upper 64-bits of the 128-bit result
  // in an undefined state.
  case Intrinsic::x86_sse4a_extrq:
  case Intrinsic::x86_sse4a_extrqi:
  case Intrinsic::x86_sse4a_insertq:
  case Intrinsic::x86_sse4a_insertqi:
    UndefElts.setHighBits(VWidth / 2);
    break;
  }
  return std::nullopt;
}