//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the X86-specific instCombineIntrinsic hook used by
/// the InstCombine pass. It uses the target's detailed information to fold
/// X86 vector intrinsics into generic IR or constants, while letting the
/// target-independent InstCombine transforms handle everything else.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
                                      Constant::getNullValue(IntTy), V, DL);
  assert(V && "Vector must be foldable");
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask, DL);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (match(Mask, m_SExt(m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
    // The pass-through vector for an x86 masked load is a zero vector.
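    // (The x86 masked-load instructions zero the masked-off elements, so an
    // llvm.masked.load with a zero pass-through is an exact replacement; the
    // Align(1) below is conservative because the x86 intrinsics take an
    // unaligned pointer.)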
71 CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad( 72 II.getType(), Ptr, Align(1), BoolMask, ZeroVec); 73 return IC.replaceInstUsesWith(II, NewMaskedLoad); 74 } 75 76 return nullptr; 77 } 78 79 // TODO: If the x86 backend knew how to convert a bool vector mask back to an 80 // XMM register mask efficiently, we could transform all x86 masked intrinsics 81 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs. 82 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) { 83 Value *Ptr = II.getOperand(0); 84 Value *Mask = II.getOperand(1); 85 Value *Vec = II.getOperand(2); 86 87 // Zero Mask - this masked store instruction does nothing. 88 if (isa<ConstantAggregateZero>(Mask)) { 89 IC.eraseInstFromFunction(II); 90 return true; 91 } 92 93 // The SSE2 version is too weird (eg, unaligned but non-temporal) to do 94 // anything else at this level. 95 if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu) 96 return false; 97 98 // The mask is constant or extended from a bool vector. Convert this x86 99 // intrinsic to the LLVM intrinsic to allow target-independent optimizations. 100 if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) { 101 unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace(); 102 PointerType *VecPtrTy = PointerType::get(Vec->getContext(), AddrSpace); 103 Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec"); 104 105 IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask); 106 107 // 'Replace uses' doesn't work for stores. Erase the original masked store. 108 IC.eraseInstFromFunction(II); 109 return true; 110 } 111 112 return false; 113 } 114 115 static Value *simplifyX86immShift(const IntrinsicInst &II, 116 InstCombiner::BuilderTy &Builder) { 117 bool LogicalShift = false; 118 bool ShiftLeft = false; 119 bool IsImm = false; 120 121 switch (II.getIntrinsicID()) { 122 default: 123 llvm_unreachable("Unexpected intrinsic!"); 124 case Intrinsic::x86_sse2_psrai_d: 125 case Intrinsic::x86_sse2_psrai_w: 126 case Intrinsic::x86_avx2_psrai_d: 127 case Intrinsic::x86_avx2_psrai_w: 128 case Intrinsic::x86_avx512_psrai_q_128: 129 case Intrinsic::x86_avx512_psrai_q_256: 130 case Intrinsic::x86_avx512_psrai_d_512: 131 case Intrinsic::x86_avx512_psrai_q_512: 132 case Intrinsic::x86_avx512_psrai_w_512: 133 IsImm = true; 134 [[fallthrough]]; 135 case Intrinsic::x86_sse2_psra_d: 136 case Intrinsic::x86_sse2_psra_w: 137 case Intrinsic::x86_avx2_psra_d: 138 case Intrinsic::x86_avx2_psra_w: 139 case Intrinsic::x86_avx512_psra_q_128: 140 case Intrinsic::x86_avx512_psra_q_256: 141 case Intrinsic::x86_avx512_psra_d_512: 142 case Intrinsic::x86_avx512_psra_q_512: 143 case Intrinsic::x86_avx512_psra_w_512: 144 LogicalShift = false; 145 ShiftLeft = false; 146 break; 147 case Intrinsic::x86_sse2_psrli_d: 148 case Intrinsic::x86_sse2_psrli_q: 149 case Intrinsic::x86_sse2_psrli_w: 150 case Intrinsic::x86_avx2_psrli_d: 151 case Intrinsic::x86_avx2_psrli_q: 152 case Intrinsic::x86_avx2_psrli_w: 153 case Intrinsic::x86_avx512_psrli_d_512: 154 case Intrinsic::x86_avx512_psrli_q_512: 155 case Intrinsic::x86_avx512_psrli_w_512: 156 IsImm = true; 157 [[fallthrough]]; 158 case Intrinsic::x86_sse2_psrl_d: 159 case Intrinsic::x86_sse2_psrl_q: 160 case Intrinsic::x86_sse2_psrl_w: 161 case Intrinsic::x86_avx2_psrl_d: 162 case Intrinsic::x86_avx2_psrl_q: 163 case Intrinsic::x86_avx2_psrl_w: 164 case Intrinsic::x86_avx512_psrl_d_512: 165 case Intrinsic::x86_avx512_psrl_q_512: 166 case Intrinsic::x86_avx512_psrl_w_512: 167 
LogicalShift = true; 168 ShiftLeft = false; 169 break; 170 case Intrinsic::x86_sse2_pslli_d: 171 case Intrinsic::x86_sse2_pslli_q: 172 case Intrinsic::x86_sse2_pslli_w: 173 case Intrinsic::x86_avx2_pslli_d: 174 case Intrinsic::x86_avx2_pslli_q: 175 case Intrinsic::x86_avx2_pslli_w: 176 case Intrinsic::x86_avx512_pslli_d_512: 177 case Intrinsic::x86_avx512_pslli_q_512: 178 case Intrinsic::x86_avx512_pslli_w_512: 179 IsImm = true; 180 [[fallthrough]]; 181 case Intrinsic::x86_sse2_psll_d: 182 case Intrinsic::x86_sse2_psll_q: 183 case Intrinsic::x86_sse2_psll_w: 184 case Intrinsic::x86_avx2_psll_d: 185 case Intrinsic::x86_avx2_psll_q: 186 case Intrinsic::x86_avx2_psll_w: 187 case Intrinsic::x86_avx512_psll_d_512: 188 case Intrinsic::x86_avx512_psll_q_512: 189 case Intrinsic::x86_avx512_psll_w_512: 190 LogicalShift = true; 191 ShiftLeft = true; 192 break; 193 } 194 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 195 196 Value *Vec = II.getArgOperand(0); 197 Value *Amt = II.getArgOperand(1); 198 auto *VT = cast<FixedVectorType>(Vec->getType()); 199 Type *SVT = VT->getElementType(); 200 Type *AmtVT = Amt->getType(); 201 unsigned VWidth = VT->getNumElements(); 202 unsigned BitWidth = SVT->getPrimitiveSizeInBits(); 203 204 // If the shift amount is guaranteed to be in-range we can replace it with a 205 // generic shift. If its guaranteed to be out of range, logical shifts combine 206 // to zero and arithmetic shifts are clamped to (BitWidth - 1). 207 if (IsImm) { 208 assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type"); 209 KnownBits KnownAmtBits = 210 llvm::computeKnownBits(Amt, II.getDataLayout()); 211 if (KnownAmtBits.getMaxValue().ult(BitWidth)) { 212 Amt = Builder.CreateZExtOrTrunc(Amt, SVT); 213 Amt = Builder.CreateVectorSplat(VWidth, Amt); 214 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 215 : Builder.CreateLShr(Vec, Amt)) 216 : Builder.CreateAShr(Vec, Amt)); 217 } 218 if (KnownAmtBits.getMinValue().uge(BitWidth)) { 219 if (LogicalShift) 220 return ConstantAggregateZero::get(VT); 221 Amt = ConstantInt::get(SVT, BitWidth - 1); 222 return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); 223 } 224 } else { 225 // Ensure the first element has an in-range value and the rest of the 226 // elements in the bottom 64 bits are zero. 227 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 228 cast<VectorType>(AmtVT)->getElementType() == SVT && 229 "Unexpected shift-by-scalar type"); 230 unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements(); 231 APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); 232 APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); 233 KnownBits KnownLowerBits = llvm::computeKnownBits( 234 Amt, DemandedLower, II.getDataLayout()); 235 KnownBits KnownUpperBits = llvm::computeKnownBits( 236 Amt, DemandedUpper, II.getDataLayout()); 237 if (KnownLowerBits.getMaxValue().ult(BitWidth) && 238 (DemandedUpper.isZero() || KnownUpperBits.isZero())) { 239 SmallVector<int, 16> ZeroSplat(VWidth, 0); 240 Amt = Builder.CreateShuffleVector(Amt, ZeroSplat); 241 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 242 : Builder.CreateLShr(Vec, Amt)) 243 : Builder.CreateAShr(Vec, Amt)); 244 } 245 } 246 247 // Simplify if count is constant vector. 248 auto *CDV = dyn_cast<ConstantDataVector>(Amt); 249 if (!CDV) 250 return nullptr; 251 252 // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector 253 // operand to compute the shift amount. 
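  // For example, a <8 x i16> shift-by-scalar amount <a0, a1, a2, a3, ...>
  // yields Count = (a3 << 48) | (a2 << 32) | (a1 << 16) | a0; the upper
  // 64 bits of the amount operand are never read.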
254 assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && 255 cast<VectorType>(AmtVT)->getElementType() == SVT && 256 "Unexpected shift-by-scalar type"); 257 258 // Concatenate the sub-elements to create the 64-bit value. 259 APInt Count(64, 0); 260 for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { 261 unsigned SubEltIdx = (NumSubElts - 1) - i; 262 auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx)); 263 Count <<= BitWidth; 264 Count |= SubElt->getValue().zextOrTrunc(64); 265 } 266 267 // If shift-by-zero then just return the original value. 268 if (Count.isZero()) 269 return Vec; 270 271 // Handle cases when Shift >= BitWidth. 272 if (Count.uge(BitWidth)) { 273 // If LogicalShift - just return zero. 274 if (LogicalShift) 275 return ConstantAggregateZero::get(VT); 276 277 // If ArithmeticShift - clamp Shift to (BitWidth - 1). 278 Count = APInt(64, BitWidth - 1); 279 } 280 281 // Get a constant vector of the same type as the first operand. 282 auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth)); 283 auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt); 284 285 if (ShiftLeft) 286 return Builder.CreateShl(Vec, ShiftVec); 287 288 if (LogicalShift) 289 return Builder.CreateLShr(Vec, ShiftVec); 290 291 return Builder.CreateAShr(Vec, ShiftVec); 292 } 293 294 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift. 295 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out 296 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit). 297 static Value *simplifyX86varShift(const IntrinsicInst &II, 298 InstCombiner::BuilderTy &Builder) { 299 bool LogicalShift = false; 300 bool ShiftLeft = false; 301 302 switch (II.getIntrinsicID()) { 303 default: 304 llvm_unreachable("Unexpected intrinsic!"); 305 case Intrinsic::x86_avx2_psrav_d: 306 case Intrinsic::x86_avx2_psrav_d_256: 307 case Intrinsic::x86_avx512_psrav_q_128: 308 case Intrinsic::x86_avx512_psrav_q_256: 309 case Intrinsic::x86_avx512_psrav_d_512: 310 case Intrinsic::x86_avx512_psrav_q_512: 311 case Intrinsic::x86_avx512_psrav_w_128: 312 case Intrinsic::x86_avx512_psrav_w_256: 313 case Intrinsic::x86_avx512_psrav_w_512: 314 LogicalShift = false; 315 ShiftLeft = false; 316 break; 317 case Intrinsic::x86_avx2_psrlv_d: 318 case Intrinsic::x86_avx2_psrlv_d_256: 319 case Intrinsic::x86_avx2_psrlv_q: 320 case Intrinsic::x86_avx2_psrlv_q_256: 321 case Intrinsic::x86_avx512_psrlv_d_512: 322 case Intrinsic::x86_avx512_psrlv_q_512: 323 case Intrinsic::x86_avx512_psrlv_w_128: 324 case Intrinsic::x86_avx512_psrlv_w_256: 325 case Intrinsic::x86_avx512_psrlv_w_512: 326 LogicalShift = true; 327 ShiftLeft = false; 328 break; 329 case Intrinsic::x86_avx2_psllv_d: 330 case Intrinsic::x86_avx2_psllv_d_256: 331 case Intrinsic::x86_avx2_psllv_q: 332 case Intrinsic::x86_avx2_psllv_q_256: 333 case Intrinsic::x86_avx512_psllv_d_512: 334 case Intrinsic::x86_avx512_psllv_q_512: 335 case Intrinsic::x86_avx512_psllv_w_128: 336 case Intrinsic::x86_avx512_psllv_w_256: 337 case Intrinsic::x86_avx512_psllv_w_512: 338 LogicalShift = true; 339 ShiftLeft = true; 340 break; 341 } 342 assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); 343 344 Value *Vec = II.getArgOperand(0); 345 Value *Amt = II.getArgOperand(1); 346 auto *VT = cast<FixedVectorType>(II.getType()); 347 Type *SVT = VT->getElementType(); 348 int NumElts = VT->getNumElements(); 349 int BitWidth = SVT->getIntegerBitWidth(); 350 351 // If the shift 
amount is guaranteed to be in-range we can replace it with a 352 // generic shift. 353 KnownBits KnownAmt = 354 llvm::computeKnownBits(Amt, II.getDataLayout()); 355 if (KnownAmt.getMaxValue().ult(BitWidth)) { 356 return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) 357 : Builder.CreateLShr(Vec, Amt)) 358 : Builder.CreateAShr(Vec, Amt)); 359 } 360 361 // Simplify if all shift amounts are constant/undef. 362 auto *CShift = dyn_cast<Constant>(Amt); 363 if (!CShift) 364 return nullptr; 365 366 // Collect each element's shift amount. 367 // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth. 368 bool AnyOutOfRange = false; 369 SmallVector<int, 8> ShiftAmts; 370 for (int I = 0; I < NumElts; ++I) { 371 auto *CElt = CShift->getAggregateElement(I); 372 if (isa_and_nonnull<UndefValue>(CElt)) { 373 ShiftAmts.push_back(-1); 374 continue; 375 } 376 377 auto *COp = dyn_cast_or_null<ConstantInt>(CElt); 378 if (!COp) 379 return nullptr; 380 381 // Handle out of range shifts. 382 // If LogicalShift - set to BitWidth (special case). 383 // If ArithmeticShift - set to (BitWidth - 1) (sign splat). 384 APInt ShiftVal = COp->getValue(); 385 if (ShiftVal.uge(BitWidth)) { 386 AnyOutOfRange = LogicalShift; 387 ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1); 388 continue; 389 } 390 391 ShiftAmts.push_back((int)ShiftVal.getZExtValue()); 392 } 393 394 // If all elements out of range or UNDEF, return vector of zeros/undefs. 395 // ArithmeticShift should only hit this if they are all UNDEF. 396 auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); }; 397 if (llvm::all_of(ShiftAmts, OutOfRange)) { 398 SmallVector<Constant *, 8> ConstantVec; 399 for (int Idx : ShiftAmts) { 400 if (Idx < 0) { 401 ConstantVec.push_back(UndefValue::get(SVT)); 402 } else { 403 assert(LogicalShift && "Logical shift expected"); 404 ConstantVec.push_back(ConstantInt::getNullValue(SVT)); 405 } 406 } 407 return ConstantVector::get(ConstantVec); 408 } 409 410 // We can't handle only some out of range values with generic logical shifts. 411 if (AnyOutOfRange) 412 return nullptr; 413 414 // Build the shift amount constant vector. 415 SmallVector<Constant *, 8> ShiftVecAmts; 416 for (int Idx : ShiftAmts) { 417 if (Idx < 0) 418 ShiftVecAmts.push_back(UndefValue::get(SVT)); 419 else 420 ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx)); 421 } 422 auto ShiftVec = ConstantVector::get(ShiftVecAmts); 423 424 if (ShiftLeft) 425 return Builder.CreateShl(Vec, ShiftVec); 426 427 if (LogicalShift) 428 return Builder.CreateLShr(Vec, ShiftVec); 429 430 return Builder.CreateAShr(Vec, ShiftVec); 431 } 432 433 static Value *simplifyX86pack(IntrinsicInst &II, 434 InstCombiner::BuilderTy &Builder, bool IsSigned) { 435 Value *Arg0 = II.getArgOperand(0); 436 Value *Arg1 = II.getArgOperand(1); 437 Type *ResTy = II.getType(); 438 439 // Fast all undef handling. 
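  // (With both inputs undef, every saturated result element is undef too.)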
440 if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1)) 441 return UndefValue::get(ResTy); 442 443 auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 444 unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128; 445 unsigned NumSrcElts = ArgTy->getNumElements(); 446 assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) && 447 "Unexpected packing types"); 448 449 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; 450 unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits(); 451 unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits(); 452 assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) && 453 "Unexpected packing types"); 454 455 // Constant folding. 456 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 457 return nullptr; 458 459 // Clamp Values - signed/unsigned both use signed clamp values, but they 460 // differ on the min/max values. 461 APInt MinValue, MaxValue; 462 if (IsSigned) { 463 // PACKSS: Truncate signed value with signed saturation. 464 // Source values less than dst minint are saturated to minint. 465 // Source values greater than dst maxint are saturated to maxint. 466 MinValue = 467 APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 468 MaxValue = 469 APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits); 470 } else { 471 // PACKUS: Truncate signed value with unsigned saturation. 472 // Source values less than zero are saturated to zero. 473 // Source values greater than dst maxuint are saturated to maxuint. 474 MinValue = APInt::getZero(SrcScalarSizeInBits); 475 MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits); 476 } 477 478 auto *MinC = Constant::getIntegerValue(ArgTy, MinValue); 479 auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue); 480 Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0); 481 Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1); 482 Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0); 483 Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1); 484 485 // Shuffle clamped args together at the lane level. 486 SmallVector<int, 32> PackMask; 487 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 488 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 489 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane)); 490 for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt) 491 PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts); 492 } 493 auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask); 494 495 // Truncate to dst size. 496 return Builder.CreateTrunc(Shuffle, ResTy); 497 } 498 499 static Value *simplifyX86pmulh(IntrinsicInst &II, 500 InstCombiner::BuilderTy &Builder, bool IsSigned, 501 bool IsRounding) { 502 Value *Arg0 = II.getArgOperand(0); 503 Value *Arg1 = II.getArgOperand(1); 504 auto *ResTy = cast<FixedVectorType>(II.getType()); 505 auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 506 assert(ArgTy == ResTy && ResTy->getScalarSizeInBits() == 16 && 507 "Unexpected PMULH types"); 508 assert((!IsRounding || IsSigned) && "PMULHRS instruction must be signed"); 509 510 // Multiply by undef -> zero (NOT undef!) as other arg could still be zero. 511 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1)) 512 return ConstantAggregateZero::get(ResTy); 513 514 // Multiply by zero. 515 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) 516 return ConstantAggregateZero::get(ResTy); 517 518 // Multiply by one. 
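  // For pmulh(u), x * 1 keeps only the high 16 bits of the widened product:
  // signed, that is the sign-splat of x (ashr by 15); unsigned, it is zero.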
519 if (!IsRounding) { 520 if (match(Arg0, m_One())) 521 return IsSigned ? Builder.CreateAShr(Arg1, 15) 522 : ConstantAggregateZero::get(ResTy); 523 if (match(Arg1, m_One())) 524 return IsSigned ? Builder.CreateAShr(Arg0, 15) 525 : ConstantAggregateZero::get(ResTy); 526 } 527 528 // Constant folding. 529 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 530 return nullptr; 531 532 // Extend to twice the width and multiply. 533 auto Cast = 534 IsSigned ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt; 535 auto *ExtTy = FixedVectorType::getExtendedElementVectorType(ArgTy); 536 Value *LHS = Builder.CreateCast(Cast, Arg0, ExtTy); 537 Value *RHS = Builder.CreateCast(Cast, Arg1, ExtTy); 538 Value *Mul = Builder.CreateMul(LHS, RHS); 539 540 if (IsRounding) { 541 // PMULHRSW: truncate to vXi18 of the most significant bits, add one and 542 // extract bits[16:1]. 543 auto *RndEltTy = IntegerType::get(ExtTy->getContext(), 18); 544 auto *RndTy = FixedVectorType::get(RndEltTy, ExtTy); 545 Mul = Builder.CreateLShr(Mul, 14); 546 Mul = Builder.CreateTrunc(Mul, RndTy); 547 Mul = Builder.CreateAdd(Mul, ConstantInt::get(RndTy, 1)); 548 Mul = Builder.CreateLShr(Mul, 1); 549 } else { 550 // PMULH/PMULHU: extract the vXi16 most significant bits. 551 Mul = Builder.CreateLShr(Mul, 16); 552 } 553 554 return Builder.CreateTrunc(Mul, ResTy); 555 } 556 557 static Value *simplifyX86pmadd(IntrinsicInst &II, 558 InstCombiner::BuilderTy &Builder, 559 bool IsPMADDWD) { 560 Value *Arg0 = II.getArgOperand(0); 561 Value *Arg1 = II.getArgOperand(1); 562 auto *ResTy = cast<FixedVectorType>(II.getType()); 563 [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType()); 564 565 unsigned NumDstElts = ResTy->getNumElements(); 566 assert(ArgTy->getNumElements() == (2 * NumDstElts) && 567 ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) && 568 "Unexpected PMADD types"); 569 570 // Multiply by undef -> zero (NOT undef!) as other arg could still be zero. 571 if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1)) 572 return ConstantAggregateZero::get(ResTy); 573 574 // Multiply by zero. 575 if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) 576 return ConstantAggregateZero::get(ResTy); 577 578 // Constant folding. 579 if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1)) 580 return nullptr; 581 582 // Split Lo/Hi elements pairs, extend and add together. 583 // PMADDWD(X,Y) = 584 // add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1]))) 585 // PMADDUBSW(X,Y) = 586 // sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1]))) 587 SmallVector<int> LoMask, HiMask; 588 for (unsigned I = 0; I != NumDstElts; ++I) { 589 LoMask.push_back(2 * I + 0); 590 HiMask.push_back(2 * I + 1); 591 } 592 593 auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask); 594 auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask); 595 auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask); 596 auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask); 597 598 auto LHSCast = 599 IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt; 600 LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy); 601 LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy); 602 RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy); 603 RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy); 604 Value *Lo = Builder.CreateMul(LHSLo, RHSLo); 605 Value *Hi = Builder.CreateMul(LHSHi, RHSHi); 606 return IsPMADDWD 607 ? 
Builder.CreateAdd(Lo, Hi)
             : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  // Preserve previous behavior and give up.
  // TODO: treat as <8 x i8>.
  if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb)
    return nullptr;

  auto *ArgTy = cast<FixedVectorType>(Arg->getType());

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
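    // llvm.uadd.with.overflow returns {iN sum, i1 overflow}, while the x86
    // addcarry intrinsic returns {i8 carry-out, iN sum}, so zero-extend the
    // overflow bit to i8 and swap the field order when rebuilding the result.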
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = PoisonValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyTernarylogic(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {

  auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
  if (!ArgImm || ArgImm->getValue().uge(256))
    return nullptr;

  Value *ArgA = II.getArgOperand(0);
  Value *ArgB = II.getArgOperand(1);
  Value *ArgC = II.getArgOperand(2);

  Type *Ty = II.getType();

  auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
  };
  auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
  };
  auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
  };
  auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateNot(V.first), ~V.second};
  };
  auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
  auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
  auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };

  bool AIsConst = match(ArgA, m_ImmConstant());
  bool BIsConst = match(ArgB, m_ImmConstant());
  bool CIsConst = match(ArgC, m_ImmConstant());

  bool ABIsConst = AIsConst && BIsConst;
  bool ACIsConst = AIsConst && CIsConst;
  bool BCIsConst = BIsConst && CIsConst;
  bool ABCIsConst = AIsConst && BIsConst && CIsConst;

  // Use for verification. It's a big table. It's difficult to go from Imm ->
  // logic ops, but easy to verify that a set of logic ops is correct. We track
  // the logic ops through the second value in the pair. At the end it should
  // equal Imm.
  std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
  std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
  std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
  std::pair<Value *, uint8_t> Res = {nullptr, 0};

  // Currently we only handle cases that convert directly to another instruction
  // or cases where all the ops are constant. This is because we don't properly
  // handle creating ternary ops in the backend, so splitting them here may
  // cause regressions. As the backend improves, uncomment more cases.
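  // The immediate is the truth table of the selected function with A, B and C
  // as inputs: e.g. Imm == 0x80 == (0xf0 & 0xcc & 0xaa) keeps only the rows
  // where all three inputs are one, which is why it lowers to And(And(A, B), C).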
720 721 uint8_t Imm = ArgImm->getValue().getZExtValue(); 722 switch (Imm) { 723 case 0x0: 724 Res = {Constant::getNullValue(Ty), 0}; 725 break; 726 case 0x1: 727 if (ABCIsConst) 728 Res = Nor(Or(A, B), C); 729 break; 730 case 0x2: 731 if (ABCIsConst) 732 Res = And(Nor(A, B), C); 733 break; 734 case 0x3: 735 if (ABIsConst) 736 Res = Nor(A, B); 737 break; 738 case 0x4: 739 if (ABCIsConst) 740 Res = And(Nor(A, C), B); 741 break; 742 case 0x5: 743 if (ACIsConst) 744 Res = Nor(A, C); 745 break; 746 case 0x6: 747 if (ABCIsConst) 748 Res = Nor(A, Xnor(B, C)); 749 break; 750 case 0x7: 751 if (ABCIsConst) 752 Res = Nor(A, And(B, C)); 753 break; 754 case 0x8: 755 if (ABCIsConst) 756 Res = Nor(A, Nand(B, C)); 757 break; 758 case 0x9: 759 if (ABCIsConst) 760 Res = Nor(A, Xor(B, C)); 761 break; 762 case 0xa: 763 if (ACIsConst) 764 Res = Nor(A, Not(C)); 765 break; 766 case 0xb: 767 if (ABCIsConst) 768 Res = Nor(A, Nor(C, Not(B))); 769 break; 770 case 0xc: 771 if (ABIsConst) 772 Res = Nor(A, Not(B)); 773 break; 774 case 0xd: 775 if (ABCIsConst) 776 Res = Nor(A, Nor(B, Not(C))); 777 break; 778 case 0xe: 779 if (ABCIsConst) 780 Res = Nor(A, Nor(B, C)); 781 break; 782 case 0xf: 783 Res = Not(A); 784 break; 785 case 0x10: 786 if (ABCIsConst) 787 Res = And(A, Nor(B, C)); 788 break; 789 case 0x11: 790 if (BCIsConst) 791 Res = Nor(B, C); 792 break; 793 case 0x12: 794 if (ABCIsConst) 795 Res = Nor(Xnor(A, C), B); 796 break; 797 case 0x13: 798 if (ABCIsConst) 799 Res = Nor(And(A, C), B); 800 break; 801 case 0x14: 802 if (ABCIsConst) 803 Res = Nor(Xnor(A, B), C); 804 break; 805 case 0x15: 806 if (ABCIsConst) 807 Res = Nor(And(A, B), C); 808 break; 809 case 0x16: 810 if (ABCIsConst) 811 Res = Xor(Xor(A, B), And(Nand(A, B), C)); 812 break; 813 case 0x17: 814 if (ABCIsConst) 815 Res = Xor(Or(A, B), Or(Xnor(A, B), C)); 816 break; 817 case 0x18: 818 if (ABCIsConst) 819 Res = Nor(Xnor(A, B), Xnor(A, C)); 820 break; 821 case 0x19: 822 if (ABCIsConst) 823 Res = And(Nand(A, B), Xnor(B, C)); 824 break; 825 case 0x1a: 826 if (ABCIsConst) 827 Res = Xor(A, Or(And(A, B), C)); 828 break; 829 case 0x1b: 830 if (ABCIsConst) 831 Res = Xor(A, Or(Xnor(A, B), C)); 832 break; 833 case 0x1c: 834 if (ABCIsConst) 835 Res = Xor(A, Or(And(A, C), B)); 836 break; 837 case 0x1d: 838 if (ABCIsConst) 839 Res = Xor(A, Or(Xnor(A, C), B)); 840 break; 841 case 0x1e: 842 if (ABCIsConst) 843 Res = Xor(A, Or(B, C)); 844 break; 845 case 0x1f: 846 if (ABCIsConst) 847 Res = Nand(A, Or(B, C)); 848 break; 849 case 0x20: 850 if (ABCIsConst) 851 Res = Nor(Nand(A, C), B); 852 break; 853 case 0x21: 854 if (ABCIsConst) 855 Res = Nor(Xor(A, C), B); 856 break; 857 case 0x22: 858 if (BCIsConst) 859 Res = Nor(B, Not(C)); 860 break; 861 case 0x23: 862 if (ABCIsConst) 863 Res = Nor(B, Nor(C, Not(A))); 864 break; 865 case 0x24: 866 if (ABCIsConst) 867 Res = Nor(Xnor(A, B), Xor(A, C)); 868 break; 869 case 0x25: 870 if (ABCIsConst) 871 Res = Xor(A, Nand(Nand(A, B), C)); 872 break; 873 case 0x26: 874 if (ABCIsConst) 875 Res = And(Nand(A, B), Xor(B, C)); 876 break; 877 case 0x27: 878 if (ABCIsConst) 879 Res = Xor(Or(Xnor(A, B), C), B); 880 break; 881 case 0x28: 882 if (ABCIsConst) 883 Res = And(Xor(A, B), C); 884 break; 885 case 0x29: 886 if (ABCIsConst) 887 Res = Xor(Xor(A, B), Nor(And(A, B), C)); 888 break; 889 case 0x2a: 890 if (ABCIsConst) 891 Res = And(Nand(A, B), C); 892 break; 893 case 0x2b: 894 if (ABCIsConst) 895 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A); 896 break; 897 case 0x2c: 898 if (ABCIsConst) 899 Res = Nor(Xnor(A, B), Nor(B, C)); 900 break; 901 case 0x2d: 
902 if (ABCIsConst) 903 Res = Xor(A, Or(B, Not(C))); 904 break; 905 case 0x2e: 906 if (ABCIsConst) 907 Res = Xor(A, Or(Xor(A, C), B)); 908 break; 909 case 0x2f: 910 if (ABCIsConst) 911 Res = Nand(A, Or(B, Not(C))); 912 break; 913 case 0x30: 914 if (ABIsConst) 915 Res = Nor(B, Not(A)); 916 break; 917 case 0x31: 918 if (ABCIsConst) 919 Res = Nor(Nor(A, Not(C)), B); 920 break; 921 case 0x32: 922 if (ABCIsConst) 923 Res = Nor(Nor(A, C), B); 924 break; 925 case 0x33: 926 Res = Not(B); 927 break; 928 case 0x34: 929 if (ABCIsConst) 930 Res = And(Xor(A, B), Nand(B, C)); 931 break; 932 case 0x35: 933 if (ABCIsConst) 934 Res = Xor(B, Or(A, Xnor(B, C))); 935 break; 936 case 0x36: 937 if (ABCIsConst) 938 Res = Xor(Or(A, C), B); 939 break; 940 case 0x37: 941 if (ABCIsConst) 942 Res = Nand(Or(A, C), B); 943 break; 944 case 0x38: 945 if (ABCIsConst) 946 Res = Nor(Xnor(A, B), Nor(A, C)); 947 break; 948 case 0x39: 949 if (ABCIsConst) 950 Res = Xor(Or(A, Not(C)), B); 951 break; 952 case 0x3a: 953 if (ABCIsConst) 954 Res = Xor(B, Or(A, Xor(B, C))); 955 break; 956 case 0x3b: 957 if (ABCIsConst) 958 Res = Nand(Or(A, Not(C)), B); 959 break; 960 case 0x3c: 961 Res = Xor(A, B); 962 break; 963 case 0x3d: 964 if (ABCIsConst) 965 Res = Xor(A, Or(Nor(A, C), B)); 966 break; 967 case 0x3e: 968 if (ABCIsConst) 969 Res = Xor(A, Or(Nor(A, Not(C)), B)); 970 break; 971 case 0x3f: 972 if (ABIsConst) 973 Res = Nand(A, B); 974 break; 975 case 0x40: 976 if (ABCIsConst) 977 Res = Nor(Nand(A, B), C); 978 break; 979 case 0x41: 980 if (ABCIsConst) 981 Res = Nor(Xor(A, B), C); 982 break; 983 case 0x42: 984 if (ABCIsConst) 985 Res = Nor(Xor(A, B), Xnor(A, C)); 986 break; 987 case 0x43: 988 if (ABCIsConst) 989 Res = Xor(A, Nand(Nand(A, C), B)); 990 break; 991 case 0x44: 992 if (BCIsConst) 993 Res = Nor(C, Not(B)); 994 break; 995 case 0x45: 996 if (ABCIsConst) 997 Res = Nor(Nor(B, Not(A)), C); 998 break; 999 case 0x46: 1000 if (ABCIsConst) 1001 Res = Xor(Or(And(A, C), B), C); 1002 break; 1003 case 0x47: 1004 if (ABCIsConst) 1005 Res = Xor(Or(Xnor(A, C), B), C); 1006 break; 1007 case 0x48: 1008 if (ABCIsConst) 1009 Res = And(Xor(A, C), B); 1010 break; 1011 case 0x49: 1012 if (ABCIsConst) 1013 Res = Xor(Or(Xnor(A, B), And(A, C)), C); 1014 break; 1015 case 0x4a: 1016 if (ABCIsConst) 1017 Res = Nor(Xnor(A, C), Nor(B, C)); 1018 break; 1019 case 0x4b: 1020 if (ABCIsConst) 1021 Res = Xor(A, Or(C, Not(B))); 1022 break; 1023 case 0x4c: 1024 if (ABCIsConst) 1025 Res = And(Nand(A, C), B); 1026 break; 1027 case 0x4d: 1028 if (ABCIsConst) 1029 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A); 1030 break; 1031 case 0x4e: 1032 if (ABCIsConst) 1033 Res = Xor(A, Or(Xor(A, B), C)); 1034 break; 1035 case 0x4f: 1036 if (ABCIsConst) 1037 Res = Nand(A, Nand(B, Not(C))); 1038 break; 1039 case 0x50: 1040 if (ACIsConst) 1041 Res = Nor(C, Not(A)); 1042 break; 1043 case 0x51: 1044 if (ABCIsConst) 1045 Res = Nor(Nor(A, Not(B)), C); 1046 break; 1047 case 0x52: 1048 if (ABCIsConst) 1049 Res = And(Xor(A, C), Nand(B, C)); 1050 break; 1051 case 0x53: 1052 if (ABCIsConst) 1053 Res = Xor(Or(Xnor(B, C), A), C); 1054 break; 1055 case 0x54: 1056 if (ABCIsConst) 1057 Res = Nor(Nor(A, B), C); 1058 break; 1059 case 0x55: 1060 Res = Not(C); 1061 break; 1062 case 0x56: 1063 if (ABCIsConst) 1064 Res = Xor(Or(A, B), C); 1065 break; 1066 case 0x57: 1067 if (ABCIsConst) 1068 Res = Nand(Or(A, B), C); 1069 break; 1070 case 0x58: 1071 if (ABCIsConst) 1072 Res = Nor(Nor(A, B), Xnor(A, C)); 1073 break; 1074 case 0x59: 1075 if (ABCIsConst) 1076 Res = Xor(Or(A, Not(B)), C); 1077 break; 1078 case 
0x5a: 1079 Res = Xor(A, C); 1080 break; 1081 case 0x5b: 1082 if (ABCIsConst) 1083 Res = Xor(A, Or(Nor(A, B), C)); 1084 break; 1085 case 0x5c: 1086 if (ABCIsConst) 1087 Res = Xor(Or(Xor(B, C), A), C); 1088 break; 1089 case 0x5d: 1090 if (ABCIsConst) 1091 Res = Nand(Or(A, Not(B)), C); 1092 break; 1093 case 0x5e: 1094 if (ABCIsConst) 1095 Res = Xor(A, Or(Nor(A, Not(B)), C)); 1096 break; 1097 case 0x5f: 1098 if (ACIsConst) 1099 Res = Nand(A, C); 1100 break; 1101 case 0x60: 1102 if (ABCIsConst) 1103 Res = And(A, Xor(B, C)); 1104 break; 1105 case 0x61: 1106 if (ABCIsConst) 1107 Res = Xor(Or(Xnor(A, B), And(B, C)), C); 1108 break; 1109 case 0x62: 1110 if (ABCIsConst) 1111 Res = Nor(Nor(A, C), Xnor(B, C)); 1112 break; 1113 case 0x63: 1114 if (ABCIsConst) 1115 Res = Xor(B, Or(C, Not(A))); 1116 break; 1117 case 0x64: 1118 if (ABCIsConst) 1119 Res = Nor(Nor(A, B), Xnor(B, C)); 1120 break; 1121 case 0x65: 1122 if (ABCIsConst) 1123 Res = Xor(Or(B, Not(A)), C); 1124 break; 1125 case 0x66: 1126 Res = Xor(B, C); 1127 break; 1128 case 0x67: 1129 if (ABCIsConst) 1130 Res = Or(Nor(A, B), Xor(B, C)); 1131 break; 1132 case 0x68: 1133 if (ABCIsConst) 1134 Res = Xor(Xor(A, B), Nor(Nor(A, B), C)); 1135 break; 1136 case 0x69: 1137 if (ABCIsConst) 1138 Res = Xor(Xnor(A, B), C); 1139 break; 1140 case 0x6a: 1141 if (ABCIsConst) 1142 Res = Xor(And(A, B), C); 1143 break; 1144 case 0x6b: 1145 if (ABCIsConst) 1146 Res = Or(Nor(A, B), Xor(Xnor(A, B), C)); 1147 break; 1148 case 0x6c: 1149 if (ABCIsConst) 1150 Res = Xor(And(A, C), B); 1151 break; 1152 case 0x6d: 1153 if (ABCIsConst) 1154 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C); 1155 break; 1156 case 0x6e: 1157 if (ABCIsConst) 1158 Res = Or(Nor(A, Not(B)), Xor(B, C)); 1159 break; 1160 case 0x6f: 1161 if (ABCIsConst) 1162 Res = Nand(A, Xnor(B, C)); 1163 break; 1164 case 0x70: 1165 if (ABCIsConst) 1166 Res = And(A, Nand(B, C)); 1167 break; 1168 case 0x71: 1169 if (ABCIsConst) 1170 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A); 1171 break; 1172 case 0x72: 1173 if (ABCIsConst) 1174 Res = Xor(Or(Xor(A, B), C), B); 1175 break; 1176 case 0x73: 1177 if (ABCIsConst) 1178 Res = Nand(Nand(A, Not(C)), B); 1179 break; 1180 case 0x74: 1181 if (ABCIsConst) 1182 Res = Xor(Or(Xor(A, C), B), C); 1183 break; 1184 case 0x75: 1185 if (ABCIsConst) 1186 Res = Nand(Nand(A, Not(B)), C); 1187 break; 1188 case 0x76: 1189 if (ABCIsConst) 1190 Res = Xor(B, Or(Nor(B, Not(A)), C)); 1191 break; 1192 case 0x77: 1193 if (BCIsConst) 1194 Res = Nand(B, C); 1195 break; 1196 case 0x78: 1197 if (ABCIsConst) 1198 Res = Xor(A, And(B, C)); 1199 break; 1200 case 0x79: 1201 if (ABCIsConst) 1202 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C); 1203 break; 1204 case 0x7a: 1205 if (ABCIsConst) 1206 Res = Or(Xor(A, C), Nor(B, Not(A))); 1207 break; 1208 case 0x7b: 1209 if (ABCIsConst) 1210 Res = Nand(Xnor(A, C), B); 1211 break; 1212 case 0x7c: 1213 if (ABCIsConst) 1214 Res = Or(Xor(A, B), Nor(C, Not(A))); 1215 break; 1216 case 0x7d: 1217 if (ABCIsConst) 1218 Res = Nand(Xnor(A, B), C); 1219 break; 1220 case 0x7e: 1221 if (ABCIsConst) 1222 Res = Or(Xor(A, B), Xor(A, C)); 1223 break; 1224 case 0x7f: 1225 if (ABCIsConst) 1226 Res = Nand(And(A, B), C); 1227 break; 1228 case 0x80: 1229 if (ABCIsConst) 1230 Res = And(And(A, B), C); 1231 break; 1232 case 0x81: 1233 if (ABCIsConst) 1234 Res = Nor(Xor(A, B), Xor(A, C)); 1235 break; 1236 case 0x82: 1237 if (ABCIsConst) 1238 Res = And(Xnor(A, B), C); 1239 break; 1240 case 0x83: 1241 if (ABCIsConst) 1242 Res = Nor(Xor(A, B), Nor(C, Not(A))); 1243 break; 1244 case 0x84: 1245 if (ABCIsConst) 1246 Res 
= And(Xnor(A, C), B); 1247 break; 1248 case 0x85: 1249 if (ABCIsConst) 1250 Res = Nor(Xor(A, C), Nor(B, Not(A))); 1251 break; 1252 case 0x86: 1253 if (ABCIsConst) 1254 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C); 1255 break; 1256 case 0x87: 1257 if (ABCIsConst) 1258 Res = Xor(A, Nand(B, C)); 1259 break; 1260 case 0x88: 1261 Res = And(B, C); 1262 break; 1263 case 0x89: 1264 if (ABCIsConst) 1265 Res = Xor(B, Nor(Nor(B, Not(A)), C)); 1266 break; 1267 case 0x8a: 1268 if (ABCIsConst) 1269 Res = And(Nand(A, Not(B)), C); 1270 break; 1271 case 0x8b: 1272 if (ABCIsConst) 1273 Res = Xor(Nor(Xor(A, C), B), C); 1274 break; 1275 case 0x8c: 1276 if (ABCIsConst) 1277 Res = And(Nand(A, Not(C)), B); 1278 break; 1279 case 0x8d: 1280 if (ABCIsConst) 1281 Res = Xor(Nor(Xor(A, B), C), B); 1282 break; 1283 case 0x8e: 1284 if (ABCIsConst) 1285 Res = Xor(Or(Xor(A, B), Xor(A, C)), A); 1286 break; 1287 case 0x8f: 1288 if (ABCIsConst) 1289 Res = Nand(A, Nand(B, C)); 1290 break; 1291 case 0x90: 1292 if (ABCIsConst) 1293 Res = And(A, Xnor(B, C)); 1294 break; 1295 case 0x91: 1296 if (ABCIsConst) 1297 Res = Nor(Nor(A, Not(B)), Xor(B, C)); 1298 break; 1299 case 0x92: 1300 if (ABCIsConst) 1301 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C); 1302 break; 1303 case 0x93: 1304 if (ABCIsConst) 1305 Res = Xor(Nand(A, C), B); 1306 break; 1307 case 0x94: 1308 if (ABCIsConst) 1309 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C)); 1310 break; 1311 case 0x95: 1312 if (ABCIsConst) 1313 Res = Xor(Nand(A, B), C); 1314 break; 1315 case 0x96: 1316 if (ABCIsConst) 1317 Res = Xor(Xor(A, B), C); 1318 break; 1319 case 0x97: 1320 if (ABCIsConst) 1321 Res = Xor(Xor(A, B), Or(Nor(A, B), C)); 1322 break; 1323 case 0x98: 1324 if (ABCIsConst) 1325 Res = Nor(Nor(A, B), Xor(B, C)); 1326 break; 1327 case 0x99: 1328 if (BCIsConst) 1329 Res = Xnor(B, C); 1330 break; 1331 case 0x9a: 1332 if (ABCIsConst) 1333 Res = Xor(Nor(B, Not(A)), C); 1334 break; 1335 case 0x9b: 1336 if (ABCIsConst) 1337 Res = Or(Nor(A, B), Xnor(B, C)); 1338 break; 1339 case 0x9c: 1340 if (ABCIsConst) 1341 Res = Xor(B, Nor(C, Not(A))); 1342 break; 1343 case 0x9d: 1344 if (ABCIsConst) 1345 Res = Or(Nor(A, C), Xnor(B, C)); 1346 break; 1347 case 0x9e: 1348 if (ABCIsConst) 1349 Res = Xor(And(Xor(A, B), Nand(B, C)), C); 1350 break; 1351 case 0x9f: 1352 if (ABCIsConst) 1353 Res = Nand(A, Xor(B, C)); 1354 break; 1355 case 0xa0: 1356 Res = And(A, C); 1357 break; 1358 case 0xa1: 1359 if (ABCIsConst) 1360 Res = Xor(A, Nor(Nor(A, Not(B)), C)); 1361 break; 1362 case 0xa2: 1363 if (ABCIsConst) 1364 Res = And(Or(A, Not(B)), C); 1365 break; 1366 case 0xa3: 1367 if (ABCIsConst) 1368 Res = Xor(Nor(Xor(B, C), A), C); 1369 break; 1370 case 0xa4: 1371 if (ABCIsConst) 1372 Res = Xor(A, Nor(Nor(A, B), C)); 1373 break; 1374 case 0xa5: 1375 if (ACIsConst) 1376 Res = Xnor(A, C); 1377 break; 1378 case 0xa6: 1379 if (ABCIsConst) 1380 Res = Xor(Nor(A, Not(B)), C); 1381 break; 1382 case 0xa7: 1383 if (ABCIsConst) 1384 Res = Or(Nor(A, B), Xnor(A, C)); 1385 break; 1386 case 0xa8: 1387 if (ABCIsConst) 1388 Res = And(Or(A, B), C); 1389 break; 1390 case 0xa9: 1391 if (ABCIsConst) 1392 Res = Xor(Nor(A, B), C); 1393 break; 1394 case 0xaa: 1395 Res = C; 1396 break; 1397 case 0xab: 1398 if (ABCIsConst) 1399 Res = Or(Nor(A, B), C); 1400 break; 1401 case 0xac: 1402 if (ABCIsConst) 1403 Res = Xor(Nor(Xnor(B, C), A), C); 1404 break; 1405 case 0xad: 1406 if (ABCIsConst) 1407 Res = Or(Xnor(A, C), And(B, C)); 1408 break; 1409 case 0xae: 1410 if (ABCIsConst) 1411 Res = Or(Nor(A, Not(B)), C); 1412 break; 1413 case 0xaf: 1414 if (ACIsConst) 1415 
Res = Or(C, Not(A)); 1416 break; 1417 case 0xb0: 1418 if (ABCIsConst) 1419 Res = And(A, Nand(B, Not(C))); 1420 break; 1421 case 0xb1: 1422 if (ABCIsConst) 1423 Res = Xor(A, Nor(Xor(A, B), C)); 1424 break; 1425 case 0xb2: 1426 if (ABCIsConst) 1427 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A); 1428 break; 1429 case 0xb3: 1430 if (ABCIsConst) 1431 Res = Nand(Nand(A, C), B); 1432 break; 1433 case 0xb4: 1434 if (ABCIsConst) 1435 Res = Xor(A, Nor(C, Not(B))); 1436 break; 1437 case 0xb5: 1438 if (ABCIsConst) 1439 Res = Or(Xnor(A, C), Nor(B, C)); 1440 break; 1441 case 0xb6: 1442 if (ABCIsConst) 1443 Res = Xor(And(Xor(A, B), Nand(A, C)), C); 1444 break; 1445 case 0xb7: 1446 if (ABCIsConst) 1447 Res = Nand(Xor(A, C), B); 1448 break; 1449 case 0xb8: 1450 if (ABCIsConst) 1451 Res = Xor(Nor(Xnor(A, C), B), C); 1452 break; 1453 case 0xb9: 1454 if (ABCIsConst) 1455 Res = Xor(Nor(And(A, C), B), C); 1456 break; 1457 case 0xba: 1458 if (ABCIsConst) 1459 Res = Or(Nor(B, Not(A)), C); 1460 break; 1461 case 0xbb: 1462 if (BCIsConst) 1463 Res = Or(C, Not(B)); 1464 break; 1465 case 0xbc: 1466 if (ABCIsConst) 1467 Res = Xor(A, And(Nand(A, C), B)); 1468 break; 1469 case 0xbd: 1470 if (ABCIsConst) 1471 Res = Or(Xor(A, B), Xnor(A, C)); 1472 break; 1473 case 0xbe: 1474 if (ABCIsConst) 1475 Res = Or(Xor(A, B), C); 1476 break; 1477 case 0xbf: 1478 if (ABCIsConst) 1479 Res = Or(Nand(A, B), C); 1480 break; 1481 case 0xc0: 1482 Res = And(A, B); 1483 break; 1484 case 0xc1: 1485 if (ABCIsConst) 1486 Res = Xor(A, Nor(Nor(A, Not(C)), B)); 1487 break; 1488 case 0xc2: 1489 if (ABCIsConst) 1490 Res = Xor(A, Nor(Nor(A, C), B)); 1491 break; 1492 case 0xc3: 1493 if (ABIsConst) 1494 Res = Xnor(A, B); 1495 break; 1496 case 0xc4: 1497 if (ABCIsConst) 1498 Res = And(Or(A, Not(C)), B); 1499 break; 1500 case 0xc5: 1501 if (ABCIsConst) 1502 Res = Xor(B, Nor(A, Xor(B, C))); 1503 break; 1504 case 0xc6: 1505 if (ABCIsConst) 1506 Res = Xor(Nor(A, Not(C)), B); 1507 break; 1508 case 0xc7: 1509 if (ABCIsConst) 1510 Res = Or(Xnor(A, B), Nor(A, C)); 1511 break; 1512 case 0xc8: 1513 if (ABCIsConst) 1514 Res = And(Or(A, C), B); 1515 break; 1516 case 0xc9: 1517 if (ABCIsConst) 1518 Res = Xor(Nor(A, C), B); 1519 break; 1520 case 0xca: 1521 if (ABCIsConst) 1522 Res = Xor(B, Nor(A, Xnor(B, C))); 1523 break; 1524 case 0xcb: 1525 if (ABCIsConst) 1526 Res = Or(Xnor(A, B), And(B, C)); 1527 break; 1528 case 0xcc: 1529 Res = B; 1530 break; 1531 case 0xcd: 1532 if (ABCIsConst) 1533 Res = Or(Nor(A, C), B); 1534 break; 1535 case 0xce: 1536 if (ABCIsConst) 1537 Res = Or(Nor(A, Not(C)), B); 1538 break; 1539 case 0xcf: 1540 if (ABIsConst) 1541 Res = Or(B, Not(A)); 1542 break; 1543 case 0xd0: 1544 if (ABCIsConst) 1545 Res = And(A, Or(B, Not(C))); 1546 break; 1547 case 0xd1: 1548 if (ABCIsConst) 1549 Res = Xor(A, Nor(Xor(A, C), B)); 1550 break; 1551 case 0xd2: 1552 if (ABCIsConst) 1553 Res = Xor(A, Nor(B, Not(C))); 1554 break; 1555 case 0xd3: 1556 if (ABCIsConst) 1557 Res = Or(Xnor(A, B), Nor(B, C)); 1558 break; 1559 case 0xd4: 1560 if (ABCIsConst) 1561 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A); 1562 break; 1563 case 0xd5: 1564 if (ABCIsConst) 1565 Res = Nand(Nand(A, B), C); 1566 break; 1567 case 0xd6: 1568 if (ABCIsConst) 1569 Res = Xor(Xor(A, B), Or(And(A, B), C)); 1570 break; 1571 case 0xd7: 1572 if (ABCIsConst) 1573 Res = Nand(Xor(A, B), C); 1574 break; 1575 case 0xd8: 1576 if (ABCIsConst) 1577 Res = Xor(Nor(Xnor(A, B), C), B); 1578 break; 1579 case 0xd9: 1580 if (ABCIsConst) 1581 Res = Or(And(A, B), Xnor(B, C)); 1582 break; 1583 case 0xda: 1584 if (ABCIsConst) 1585 
Res = Xor(A, And(Nand(A, B), C)); 1586 break; 1587 case 0xdb: 1588 if (ABCIsConst) 1589 Res = Or(Xnor(A, B), Xor(A, C)); 1590 break; 1591 case 0xdc: 1592 if (ABCIsConst) 1593 Res = Or(B, Nor(C, Not(A))); 1594 break; 1595 case 0xdd: 1596 if (BCIsConst) 1597 Res = Or(B, Not(C)); 1598 break; 1599 case 0xde: 1600 if (ABCIsConst) 1601 Res = Or(Xor(A, C), B); 1602 break; 1603 case 0xdf: 1604 if (ABCIsConst) 1605 Res = Or(Nand(A, C), B); 1606 break; 1607 case 0xe0: 1608 if (ABCIsConst) 1609 Res = And(A, Or(B, C)); 1610 break; 1611 case 0xe1: 1612 if (ABCIsConst) 1613 Res = Xor(A, Nor(B, C)); 1614 break; 1615 case 0xe2: 1616 if (ABCIsConst) 1617 Res = Xor(A, Nor(Xnor(A, C), B)); 1618 break; 1619 case 0xe3: 1620 if (ABCIsConst) 1621 Res = Xor(A, Nor(And(A, C), B)); 1622 break; 1623 case 0xe4: 1624 if (ABCIsConst) 1625 Res = Xor(A, Nor(Xnor(A, B), C)); 1626 break; 1627 case 0xe5: 1628 if (ABCIsConst) 1629 Res = Xor(A, Nor(And(A, B), C)); 1630 break; 1631 case 0xe6: 1632 if (ABCIsConst) 1633 Res = Or(And(A, B), Xor(B, C)); 1634 break; 1635 case 0xe7: 1636 if (ABCIsConst) 1637 Res = Or(Xnor(A, B), Xnor(A, C)); 1638 break; 1639 case 0xe8: 1640 if (ABCIsConst) 1641 Res = Xor(Or(A, B), Nor(Xnor(A, B), C)); 1642 break; 1643 case 0xe9: 1644 if (ABCIsConst) 1645 Res = Xor(Xor(A, B), Nand(Nand(A, B), C)); 1646 break; 1647 case 0xea: 1648 if (ABCIsConst) 1649 Res = Or(And(A, B), C); 1650 break; 1651 case 0xeb: 1652 if (ABCIsConst) 1653 Res = Or(Xnor(A, B), C); 1654 break; 1655 case 0xec: 1656 if (ABCIsConst) 1657 Res = Or(And(A, C), B); 1658 break; 1659 case 0xed: 1660 if (ABCIsConst) 1661 Res = Or(Xnor(A, C), B); 1662 break; 1663 case 0xee: 1664 Res = Or(B, C); 1665 break; 1666 case 0xef: 1667 if (ABCIsConst) 1668 Res = Nand(A, Nor(B, C)); 1669 break; 1670 case 0xf0: 1671 Res = A; 1672 break; 1673 case 0xf1: 1674 if (ABCIsConst) 1675 Res = Or(A, Nor(B, C)); 1676 break; 1677 case 0xf2: 1678 if (ABCIsConst) 1679 Res = Or(A, Nor(B, Not(C))); 1680 break; 1681 case 0xf3: 1682 if (ABIsConst) 1683 Res = Or(A, Not(B)); 1684 break; 1685 case 0xf4: 1686 if (ABCIsConst) 1687 Res = Or(A, Nor(C, Not(B))); 1688 break; 1689 case 0xf5: 1690 if (ACIsConst) 1691 Res = Or(A, Not(C)); 1692 break; 1693 case 0xf6: 1694 if (ABCIsConst) 1695 Res = Or(A, Xor(B, C)); 1696 break; 1697 case 0xf7: 1698 if (ABCIsConst) 1699 Res = Or(A, Nand(B, C)); 1700 break; 1701 case 0xf8: 1702 if (ABCIsConst) 1703 Res = Or(A, And(B, C)); 1704 break; 1705 case 0xf9: 1706 if (ABCIsConst) 1707 Res = Or(A, Xnor(B, C)); 1708 break; 1709 case 0xfa: 1710 Res = Or(A, C); 1711 break; 1712 case 0xfb: 1713 if (ABCIsConst) 1714 Res = Nand(Nor(A, C), B); 1715 break; 1716 case 0xfc: 1717 Res = Or(A, B); 1718 break; 1719 case 0xfd: 1720 if (ABCIsConst) 1721 Res = Nand(Nor(A, B), C); 1722 break; 1723 case 0xfe: 1724 if (ABCIsConst) 1725 Res = Or(Or(A, B), C); 1726 break; 1727 case 0xff: 1728 Res = {Constant::getAllOnesValue(Ty), 0xff}; 1729 break; 1730 } 1731 1732 assert((Res.first == nullptr || Res.second == Imm) && 1733 "Simplification of ternary logic does not verify!"); 1734 return Res.first; 1735 } 1736 1737 static Value *simplifyX86insertps(const IntrinsicInst &II, 1738 InstCombiner::BuilderTy &Builder) { 1739 auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2)); 1740 if (!CInt) 1741 return nullptr; 1742 1743 auto *VecTy = cast<FixedVectorType>(II.getType()); 1744 assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type"); 1745 1746 // The immediate permute control byte looks like this: 1747 // [3:0] - zero mask for each 32-bit lane 1748 // 
[5:4] - select one 32-bit destination lane 1749 // [7:6] - select one 32-bit source lane 1750 1751 uint8_t Imm = CInt->getZExtValue(); 1752 uint8_t ZMask = Imm & 0xf; 1753 uint8_t DestLane = (Imm >> 4) & 0x3; 1754 uint8_t SourceLane = (Imm >> 6) & 0x3; 1755 1756 ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy); 1757 1758 // If all zero mask bits are set, this was just a weird way to 1759 // generate a zero vector. 1760 if (ZMask == 0xf) 1761 return ZeroVector; 1762 1763 // Initialize by passing all of the first source bits through. 1764 int ShuffleMask[4] = {0, 1, 2, 3}; 1765 1766 // We may replace the second operand with the zero vector. 1767 Value *V1 = II.getArgOperand(1); 1768 1769 if (ZMask) { 1770 // If the zero mask is being used with a single input or the zero mask 1771 // overrides the destination lane, this is a shuffle with the zero vector. 1772 if ((II.getArgOperand(0) == II.getArgOperand(1)) || 1773 (ZMask & (1 << DestLane))) { 1774 V1 = ZeroVector; 1775 // We may still move 32-bits of the first source vector from one lane 1776 // to another. 1777 ShuffleMask[DestLane] = SourceLane; 1778 // The zero mask may override the previous insert operation. 1779 for (unsigned i = 0; i < 4; ++i) 1780 if ((ZMask >> i) & 0x1) 1781 ShuffleMask[i] = i + 4; 1782 } else { 1783 // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle? 1784 return nullptr; 1785 } 1786 } else { 1787 // Replace the selected destination lane with the selected source lane. 1788 ShuffleMask[DestLane] = SourceLane + 4; 1789 } 1790 1791 return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask); 1792 } 1793 1794 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding 1795 /// or conversion to a shuffle vector. 1796 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, 1797 ConstantInt *CILength, ConstantInt *CIIndex, 1798 InstCombiner::BuilderTy &Builder) { 1799 auto LowConstantHighUndef = [&](uint64_t Val) { 1800 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 1801 Constant *Args[] = {ConstantInt::get(IntTy64, Val), 1802 UndefValue::get(IntTy64)}; 1803 return ConstantVector::get(Args); 1804 }; 1805 1806 // See if we're dealing with constant values. 1807 auto *C0 = dyn_cast<Constant>(Op0); 1808 auto *CI0 = 1809 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 1810 : nullptr; 1811 1812 // Attempt to constant fold. 1813 if (CILength && CIIndex) { 1814 // From AMD documentation: "The bit index and field length are each six 1815 // bits in length other bits of the field are ignored." 1816 APInt APIndex = CIIndex->getValue().zextOrTrunc(6); 1817 APInt APLength = CILength->getValue().zextOrTrunc(6); 1818 1819 unsigned Index = APIndex.getZExtValue(); 1820 1821 // From AMD documentation: "a value of zero in the field length is 1822 // defined as length of 64". 1823 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 1824 1825 // From AMD documentation: "If the sum of the bit index + length field 1826 // is greater than 64, the results are undefined". 1827 unsigned End = Index + Length; 1828 1829 // Note that both field index and field length are 8-bit quantities. 1830 // Since variables 'Index' and 'Length' are unsigned values 1831 // obtained from zero-extending field index and field length 1832 // respectively, their sum should never wrap around. 1833 if (End > 64) 1834 return UndefValue::get(II.getType()); 1835 1836 // If we are inserting whole bytes, we can convert this to a shuffle. 
1837 // Lowering can recognize EXTRQI shuffle masks. 1838 if ((Length % 8) == 0 && (Index % 8) == 0) { 1839 // Convert bit indices to byte indices. 1840 Length /= 8; 1841 Index /= 8; 1842 1843 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1844 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 1845 1846 SmallVector<int, 16> ShuffleMask; 1847 for (int i = 0; i != (int)Length; ++i) 1848 ShuffleMask.push_back(i + Index); 1849 for (int i = Length; i != 8; ++i) 1850 ShuffleMask.push_back(i + 16); 1851 for (int i = 8; i != 16; ++i) 1852 ShuffleMask.push_back(-1); 1853 1854 Value *SV = Builder.CreateShuffleVector( 1855 Builder.CreateBitCast(Op0, ShufTy), 1856 ConstantAggregateZero::get(ShufTy), ShuffleMask); 1857 return Builder.CreateBitCast(SV, II.getType()); 1858 } 1859 1860 // Constant Fold - shift Index'th bit to lowest position and mask off 1861 // Length bits. 1862 if (CI0) { 1863 APInt Elt = CI0->getValue(); 1864 Elt.lshrInPlace(Index); 1865 Elt = Elt.zextOrTrunc(Length); 1866 return LowConstantHighUndef(Elt.getZExtValue()); 1867 } 1868 1869 // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. 1870 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { 1871 Value *Args[] = {Op0, CILength, CIIndex}; 1872 return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_extrqi, {}, Args); 1873 } 1874 } 1875 1876 // Constant Fold - extraction from zero is always {zero, undef}. 1877 if (CI0 && CI0->isZero()) 1878 return LowConstantHighUndef(0); 1879 1880 return nullptr; 1881 } 1882 1883 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant 1884 /// folding or conversion to a shuffle vector. 1885 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, 1886 APInt APLength, APInt APIndex, 1887 InstCombiner::BuilderTy &Builder) { 1888 // From AMD documentation: "The bit index and field length are each six bits 1889 // in length other bits of the field are ignored." 1890 APIndex = APIndex.zextOrTrunc(6); 1891 APLength = APLength.zextOrTrunc(6); 1892 1893 // Attempt to constant fold. 1894 unsigned Index = APIndex.getZExtValue(); 1895 1896 // From AMD documentation: "a value of zero in the field length is 1897 // defined as length of 64". 1898 unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue(); 1899 1900 // From AMD documentation: "If the sum of the bit index + length field 1901 // is greater than 64, the results are undefined". 1902 unsigned End = Index + Length; 1903 1904 // Note that both field index and field length are 8-bit quantities. 1905 // Since variables 'Index' and 'Length' are unsigned values 1906 // obtained from zero-extending field index and field length 1907 // respectively, their sum should never wrap around. 1908 if (End > 64) 1909 return UndefValue::get(II.getType()); 1910 1911 // If we are inserting whole bytes, we can convert this to a shuffle. 1912 // Lowering can recognize INSERTQI shuffle masks. 1913 if ((Length % 8) == 0 && (Index % 8) == 0) { 1914 // Convert bit indices to byte indices. 
1915 Length /= 8; 1916 Index /= 8; 1917 1918 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1919 auto *ShufTy = FixedVectorType::get(IntTy8, 16); 1920 1921 SmallVector<int, 16> ShuffleMask; 1922 for (int i = 0; i != (int)Index; ++i) 1923 ShuffleMask.push_back(i); 1924 for (int i = 0; i != (int)Length; ++i) 1925 ShuffleMask.push_back(i + 16); 1926 for (int i = Index + Length; i != 8; ++i) 1927 ShuffleMask.push_back(i); 1928 for (int i = 8; i != 16; ++i) 1929 ShuffleMask.push_back(-1); 1930 1931 Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy), 1932 Builder.CreateBitCast(Op1, ShufTy), 1933 ShuffleMask); 1934 return Builder.CreateBitCast(SV, II.getType()); 1935 } 1936 1937 // See if we're dealing with constant values. 1938 auto *C0 = dyn_cast<Constant>(Op0); 1939 auto *C1 = dyn_cast<Constant>(Op1); 1940 auto *CI00 = 1941 C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0)) 1942 : nullptr; 1943 auto *CI10 = 1944 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 1945 : nullptr; 1946 1947 // Constant Fold - insert bottom Length bits starting at the Index'th bit. 1948 if (CI00 && CI10) { 1949 APInt V00 = CI00->getValue(); 1950 APInt V10 = CI10->getValue(); 1951 APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index); 1952 V00 = V00 & ~Mask; 1953 V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index); 1954 APInt Val = V00 | V10; 1955 Type *IntTy64 = Type::getInt64Ty(II.getContext()); 1956 Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()), 1957 UndefValue::get(IntTy64)}; 1958 return ConstantVector::get(Args); 1959 } 1960 1961 // If we were an INSERTQ call, we'll save demanded elements if we convert to 1962 // INSERTQI. 1963 if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) { 1964 Type *IntTy8 = Type::getInt8Ty(II.getContext()); 1965 Constant *CILength = ConstantInt::get(IntTy8, Length, false); 1966 Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); 1967 1968 Value *Args[] = {Op0, Op1, CILength, CIIndex}; 1969 return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_insertqi, {}, Args); 1970 } 1971 1972 return nullptr; 1973 } 1974 1975 /// Attempt to convert pshufb* to shufflevector if the mask is constant. 1976 static Value *simplifyX86pshufb(const IntrinsicInst &II, 1977 InstCombiner::BuilderTy &Builder) { 1978 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 1979 if (!V) 1980 return nullptr; 1981 1982 auto *VecTy = cast<FixedVectorType>(II.getType()); 1983 unsigned NumElts = VecTy->getNumElements(); 1984 assert((NumElts == 16 || NumElts == 32 || NumElts == 64) && 1985 "Unexpected number of elements in shuffle mask!"); 1986 1987 // Construct a shuffle mask from constant integers or UNDEFs. 1988 int Indexes[64]; 1989 1990 // Each byte in the shuffle control mask forms an index to permute the 1991 // corresponding byte in the destination operand. 1992 for (unsigned I = 0; I < NumElts; ++I) { 1993 Constant *COp = V->getAggregateElement(I); 1994 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 1995 return nullptr; 1996 1997 if (isa<UndefValue>(COp)) { 1998 Indexes[I] = -1; 1999 continue; 2000 } 2001 2002 int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue(); 2003 2004 // If the most significant bit (bit[7]) of each byte of the shuffle 2005 // control mask is set, then zero is written in the result byte. 2006 // The zero vector is in the right-hand side of the resulting 2007 // shufflevector. 
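    // E.g. in the 256-bit form, control byte 0x81 in the upper lane (I >= 16)
    // becomes index NumElts + 16 and selects a zero element, while control
    // byte 0x01 becomes index 0x01 + 16, i.e. byte 1 of the source's upper
    // lane.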
2008
2009 // The value of each index for the high 128-bit lane is the least
2010 // significant 4 bits of the respective shuffle control byte.
2011 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
2012 Indexes[I] = Index;
2013 }
2014
2015 auto V1 = II.getArgOperand(0);
2016 auto V2 = Constant::getNullValue(VecTy);
2017 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
2018 }
2019
2020 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
2021 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
2022 InstCombiner::BuilderTy &Builder) {
2023 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2024 if (!V)
2025 return nullptr;
2026
2027 auto *VecTy = cast<FixedVectorType>(II.getType());
2028 unsigned NumElts = VecTy->getNumElements();
2029 bool IsPD = VecTy->getScalarType()->isDoubleTy();
2030 unsigned NumLaneElts = IsPD ? 2 : 4;
2031 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
2032
2033 // Construct a shuffle mask from constant integers or UNDEFs.
2034 int Indexes[16];
2035
2036 // The intrinsics only read one or two bits, clear the rest.
2037 for (unsigned I = 0; I < NumElts; ++I) {
2038 Constant *COp = V->getAggregateElement(I);
2039 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2040 return nullptr;
2041
2042 if (isa<UndefValue>(COp)) {
2043 Indexes[I] = -1;
2044 continue;
2045 }
2046
2047 APInt Index = cast<ConstantInt>(COp)->getValue();
2048 Index = Index.zextOrTrunc(32).getLoBits(2);
2049
2050 // The PD variants use bit 1 to select the per-lane element index, so
2051 // shift down to convert to a generic shuffle mask index.
2052 if (IsPD)
2053 Index.lshrInPlace(1);
2054
2055 // The _256 variants are a bit trickier since the mask bits always index
2056 // into the corresponding 128-bit half. In order to convert to a generic
2057 // shuffle, we have to make that explicit.
2058 Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2059
2060 Indexes[I] = Index.getZExtValue();
2061 }
2062
2063 auto V1 = II.getArgOperand(0);
2064 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2065 }
2066
2067 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
2068 static Value *simplifyX86vpermv(const IntrinsicInst &II,
2069 InstCombiner::BuilderTy &Builder) {
2070 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2071 if (!V)
2072 return nullptr;
2073
2074 auto *VecTy = cast<FixedVectorType>(II.getType());
2075 unsigned Size = VecTy->getNumElements();
2076 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2077 "Unexpected shuffle mask size");
2078
2079 // Construct a shuffle mask from constant integers or UNDEFs.
2080 int Indexes[64];
2081
2082 for (unsigned I = 0; I < Size; ++I) {
2083 Constant *COp = V->getAggregateElement(I);
2084 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2085 return nullptr;
2086
2087 if (isa<UndefValue>(COp)) {
2088 Indexes[I] = -1;
2089 continue;
2090 }
2091
2092 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2093 Index &= Size - 1;
2094 Indexes[I] = Index;
2095 }
2096
2097 auto V1 = II.getArgOperand(0);
2098 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2099 }
2100
2101 /// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
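/// e.g. with a <4 x i32> index operand of <0, 5, 2, 7>, vpermi2var.d.128
/// selects {A[0], B[1], A[2], B[3]} from data operands A (arg 0) and B (arg 2),
/// which maps directly onto a two-source shufflevector.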
2102 static Value *simplifyX86vpermv3(const IntrinsicInst &II, 2103 InstCombiner::BuilderTy &Builder) { 2104 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 2105 if (!V) 2106 return nullptr; 2107 2108 auto *VecTy = cast<FixedVectorType>(II.getType()); 2109 unsigned Size = VecTy->getNumElements(); 2110 assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 || 2111 Size == 64) && 2112 "Unexpected shuffle mask size"); 2113 2114 // Construct a shuffle mask from constant integers or UNDEFs. 2115 int Indexes[64]; 2116 2117 for (unsigned I = 0; I < Size; ++I) { 2118 Constant *COp = V->getAggregateElement(I); 2119 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 2120 return nullptr; 2121 2122 if (isa<UndefValue>(COp)) { 2123 Indexes[I] = -1; 2124 continue; 2125 } 2126 2127 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 2128 Index &= (2 * Size) - 1; 2129 Indexes[I] = Index; 2130 } 2131 2132 auto V1 = II.getArgOperand(0); 2133 auto V2 = II.getArgOperand(2); 2134 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size)); 2135 } 2136 2137 // Simplify VPERMV/VPERMV3 mask - only demand the active index bits. 2138 static bool simplifyX86VPERMMask(Instruction *II, bool IsBinary, 2139 InstCombiner &IC) { 2140 auto *VecTy = cast<FixedVectorType>(II->getType()); 2141 unsigned EltSizeInBits = VecTy->getScalarSizeInBits(); 2142 unsigned NumElts = VecTy->getNumElements(); 2143 assert(isPowerOf2_32(NumElts) && isPowerOf2_32(EltSizeInBits) && 2144 "Unexpected shuffle mask size"); 2145 2146 unsigned IdxSizeInBits = Log2_32(IsBinary ? (2 * NumElts) : NumElts); 2147 APInt DemandedMask = APInt::getLowBitsSet(EltSizeInBits, IdxSizeInBits); 2148 2149 KnownBits KnownMask(EltSizeInBits); 2150 return IC.SimplifyDemandedBits(II, /*OpNo=*/1, DemandedMask, KnownMask); 2151 } 2152 2153 std::optional<Instruction *> 2154 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { 2155 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, 2156 unsigned DemandedWidth) { 2157 APInt UndefElts(Width, 0); 2158 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 2159 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 2160 }; 2161 2162 Intrinsic::ID IID = II.getIntrinsicID(); 2163 switch (IID) { 2164 case Intrinsic::x86_bmi_bextr_32: 2165 case Intrinsic::x86_bmi_bextr_64: 2166 case Intrinsic::x86_tbm_bextri_u32: 2167 case Intrinsic::x86_tbm_bextri_u64: 2168 // If the RHS is a constant we can try some simplifications. 2169 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2170 uint64_t Shift = C->getZExtValue(); 2171 uint64_t Length = (Shift >> 8) & 0xff; 2172 Shift &= 0xff; 2173 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 2174 // If the length is 0 or the shift is out of range, replace with zero. 2175 if (Length == 0 || Shift >= BitWidth) { 2176 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2177 } 2178 // If the LHS is also a constant, we can completely constant fold this. 2179 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 2180 uint64_t Result = InC->getZExtValue() >> Shift; 2181 if (Length > BitWidth) 2182 Length = BitWidth; 2183 Result &= maskTrailingOnes<uint64_t>(Length); 2184 return IC.replaceInstUsesWith(II, 2185 ConstantInt::get(II.getType(), Result)); 2186 } 2187 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 2188 // are only masking bits that a shift already cleared? 
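// e.g. bextr(x, 0x0804) extracts 8 bits starting at bit 4, i.e. (x >> 4) & 0xff.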
2189 }
2190 break;
2191
2192 case Intrinsic::x86_bmi_bzhi_32:
2193 case Intrinsic::x86_bmi_bzhi_64:
2194 // If the RHS is a constant we can try some simplifications.
2195 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2196 uint64_t Index = C->getZExtValue() & 0xff;
2197 unsigned BitWidth = II.getType()->getIntegerBitWidth();
2198 if (Index >= BitWidth) {
2199 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2200 }
2201 if (Index == 0) {
2202 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2203 }
2204 // If the LHS is also a constant, we can completely constant fold this.
2205 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2206 uint64_t Result = InC->getZExtValue();
2207 Result &= maskTrailingOnes<uint64_t>(Index);
2208 return IC.replaceInstUsesWith(II,
2209 ConstantInt::get(II.getType(), Result));
2210 }
2211 // TODO should we convert this to an AND if the RHS is constant?
2212 }
2213 break;
2214 case Intrinsic::x86_bmi_pext_32:
2215 case Intrinsic::x86_bmi_pext_64:
2216 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2217 if (MaskC->isNullValue()) {
2218 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2219 }
2220 if (MaskC->isAllOnesValue()) {
2221 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2222 }
2223
2224 unsigned MaskIdx, MaskLen;
2225 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2226 // Any single contiguous sequence of 1s anywhere in the mask simply
2227 // describes a subset of the input bits shifted to the appropriate
2228 // position. Replace with the straightforward IR.
2229 Value *Input = II.getArgOperand(0);
2230 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2231 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2232 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2233 return IC.replaceInstUsesWith(II, Shifted);
2234 }
2235
2236 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2237 uint64_t Src = SrcC->getZExtValue();
2238 uint64_t Mask = MaskC->getZExtValue();
2239 uint64_t Result = 0;
2240 uint64_t BitToSet = 1;
2241
2242 while (Mask) {
2243 // Isolate lowest set bit.
2244 uint64_t BitToTest = Mask & -Mask;
2245 if (BitToTest & Src)
2246 Result |= BitToSet;
2247
2248 BitToSet <<= 1;
2249 // Clear lowest set bit.
2250 Mask &= Mask - 1;
2251 }
2252
2253 return IC.replaceInstUsesWith(II,
2254 ConstantInt::get(II.getType(), Result));
2255 }
2256 }
2257 break;
2258 case Intrinsic::x86_bmi_pdep_32:
2259 case Intrinsic::x86_bmi_pdep_64:
2260 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2261 if (MaskC->isNullValue()) {
2262 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2263 }
2264 if (MaskC->isAllOnesValue()) {
2265 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2266 }
2267
2268 unsigned MaskIdx, MaskLen;
2269 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2270 // Any single contiguous sequence of 1s anywhere in the mask simply
2271 // describes a subset of the input bits shifted to the appropriate
2272 // position. Replace with the straightforward IR.
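// e.g. pdep(x, 0x00ff0000) deposits the low 8 bits of x at bit 16,
// i.e. (x << 16) & 0x00ff0000.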
2273 Value *Input = II.getArgOperand(0); 2274 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx); 2275 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt); 2276 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1)); 2277 return IC.replaceInstUsesWith(II, Masked); 2278 } 2279 2280 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 2281 uint64_t Src = SrcC->getZExtValue(); 2282 uint64_t Mask = MaskC->getZExtValue(); 2283 uint64_t Result = 0; 2284 uint64_t BitToTest = 1; 2285 2286 while (Mask) { 2287 // Isolate lowest set bit. 2288 uint64_t BitToSet = Mask & -Mask; 2289 if (BitToTest & Src) 2290 Result |= BitToSet; 2291 2292 BitToTest <<= 1; 2293 // Clear lowest set bit; 2294 Mask &= Mask - 1; 2295 } 2296 2297 return IC.replaceInstUsesWith(II, 2298 ConstantInt::get(II.getType(), Result)); 2299 } 2300 } 2301 break; 2302 2303 case Intrinsic::x86_sse_cvtss2si: 2304 case Intrinsic::x86_sse_cvtss2si64: 2305 case Intrinsic::x86_sse_cvttss2si: 2306 case Intrinsic::x86_sse_cvttss2si64: 2307 case Intrinsic::x86_sse2_cvtsd2si: 2308 case Intrinsic::x86_sse2_cvtsd2si64: 2309 case Intrinsic::x86_sse2_cvttsd2si: 2310 case Intrinsic::x86_sse2_cvttsd2si64: 2311 case Intrinsic::x86_avx512_vcvtss2si32: 2312 case Intrinsic::x86_avx512_vcvtss2si64: 2313 case Intrinsic::x86_avx512_vcvtss2usi32: 2314 case Intrinsic::x86_avx512_vcvtss2usi64: 2315 case Intrinsic::x86_avx512_vcvtsd2si32: 2316 case Intrinsic::x86_avx512_vcvtsd2si64: 2317 case Intrinsic::x86_avx512_vcvtsd2usi32: 2318 case Intrinsic::x86_avx512_vcvtsd2usi64: 2319 case Intrinsic::x86_avx512_cvttss2si: 2320 case Intrinsic::x86_avx512_cvttss2si64: 2321 case Intrinsic::x86_avx512_cvttss2usi: 2322 case Intrinsic::x86_avx512_cvttss2usi64: 2323 case Intrinsic::x86_avx512_cvttsd2si: 2324 case Intrinsic::x86_avx512_cvttsd2si64: 2325 case Intrinsic::x86_avx512_cvttsd2usi: 2326 case Intrinsic::x86_avx512_cvttsd2usi64: { 2327 // These intrinsics only demand the 0th element of their input vectors. If 2328 // we can simplify the input based on that, do so now. 
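// e.g. cvttss2si reads only element 0 of its <4 x float> operand, so values
// inserted into elements 1-3 of that operand can be dropped.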
2329 Value *Arg = II.getArgOperand(0); 2330 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); 2331 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2332 return IC.replaceOperand(II, 0, V); 2333 } 2334 break; 2335 } 2336 2337 case Intrinsic::x86_mmx_pmovmskb: 2338 case Intrinsic::x86_sse_movmsk_ps: 2339 case Intrinsic::x86_sse2_movmsk_pd: 2340 case Intrinsic::x86_sse2_pmovmskb_128: 2341 case Intrinsic::x86_avx_movmsk_pd_256: 2342 case Intrinsic::x86_avx_movmsk_ps_256: 2343 case Intrinsic::x86_avx2_pmovmskb: 2344 if (Value *V = simplifyX86movmsk(II, IC.Builder)) { 2345 return IC.replaceInstUsesWith(II, V); 2346 } 2347 break; 2348 2349 case Intrinsic::x86_sse_comieq_ss: 2350 case Intrinsic::x86_sse_comige_ss: 2351 case Intrinsic::x86_sse_comigt_ss: 2352 case Intrinsic::x86_sse_comile_ss: 2353 case Intrinsic::x86_sse_comilt_ss: 2354 case Intrinsic::x86_sse_comineq_ss: 2355 case Intrinsic::x86_sse_ucomieq_ss: 2356 case Intrinsic::x86_sse_ucomige_ss: 2357 case Intrinsic::x86_sse_ucomigt_ss: 2358 case Intrinsic::x86_sse_ucomile_ss: 2359 case Intrinsic::x86_sse_ucomilt_ss: 2360 case Intrinsic::x86_sse_ucomineq_ss: 2361 case Intrinsic::x86_sse2_comieq_sd: 2362 case Intrinsic::x86_sse2_comige_sd: 2363 case Intrinsic::x86_sse2_comigt_sd: 2364 case Intrinsic::x86_sse2_comile_sd: 2365 case Intrinsic::x86_sse2_comilt_sd: 2366 case Intrinsic::x86_sse2_comineq_sd: 2367 case Intrinsic::x86_sse2_ucomieq_sd: 2368 case Intrinsic::x86_sse2_ucomige_sd: 2369 case Intrinsic::x86_sse2_ucomigt_sd: 2370 case Intrinsic::x86_sse2_ucomile_sd: 2371 case Intrinsic::x86_sse2_ucomilt_sd: 2372 case Intrinsic::x86_sse2_ucomineq_sd: 2373 case Intrinsic::x86_avx512_vcomi_ss: 2374 case Intrinsic::x86_avx512_vcomi_sd: 2375 case Intrinsic::x86_avx512_mask_cmp_ss: 2376 case Intrinsic::x86_avx512_mask_cmp_sd: { 2377 // These intrinsics only demand the 0th element of their input vectors. If 2378 // we can simplify the input based on that, do so now. 2379 bool MadeChange = false; 2380 Value *Arg0 = II.getArgOperand(0); 2381 Value *Arg1 = II.getArgOperand(1); 2382 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2383 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2384 IC.replaceOperand(II, 0, V); 2385 MadeChange = true; 2386 } 2387 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2388 IC.replaceOperand(II, 1, V); 2389 MadeChange = true; 2390 } 2391 if (MadeChange) { 2392 return &II; 2393 } 2394 break; 2395 } 2396 2397 case Intrinsic::x86_avx512_add_ps_512: 2398 case Intrinsic::x86_avx512_div_ps_512: 2399 case Intrinsic::x86_avx512_mul_ps_512: 2400 case Intrinsic::x86_avx512_sub_ps_512: 2401 case Intrinsic::x86_avx512_add_pd_512: 2402 case Intrinsic::x86_avx512_div_pd_512: 2403 case Intrinsic::x86_avx512_mul_pd_512: 2404 case Intrinsic::x86_avx512_sub_pd_512: 2405 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2406 // IR operations. 
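// Rounding mode 4 is _MM_FROUND_CUR_DIRECTION (use the current MXCSR rounding
// mode), which matches the default floating-point environment assumed for
// plain fadd/fsub/fmul/fdiv.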
2407 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2408 if (R->getValue() == 4) { 2409 Value *Arg0 = II.getArgOperand(0); 2410 Value *Arg1 = II.getArgOperand(1); 2411 2412 Value *V; 2413 switch (IID) { 2414 default: 2415 llvm_unreachable("Case stmts out of sync!"); 2416 case Intrinsic::x86_avx512_add_ps_512: 2417 case Intrinsic::x86_avx512_add_pd_512: 2418 V = IC.Builder.CreateFAdd(Arg0, Arg1); 2419 break; 2420 case Intrinsic::x86_avx512_sub_ps_512: 2421 case Intrinsic::x86_avx512_sub_pd_512: 2422 V = IC.Builder.CreateFSub(Arg0, Arg1); 2423 break; 2424 case Intrinsic::x86_avx512_mul_ps_512: 2425 case Intrinsic::x86_avx512_mul_pd_512: 2426 V = IC.Builder.CreateFMul(Arg0, Arg1); 2427 break; 2428 case Intrinsic::x86_avx512_div_ps_512: 2429 case Intrinsic::x86_avx512_div_pd_512: 2430 V = IC.Builder.CreateFDiv(Arg0, Arg1); 2431 break; 2432 } 2433 2434 return IC.replaceInstUsesWith(II, V); 2435 } 2436 } 2437 break; 2438 2439 case Intrinsic::x86_avx512_mask_add_ss_round: 2440 case Intrinsic::x86_avx512_mask_div_ss_round: 2441 case Intrinsic::x86_avx512_mask_mul_ss_round: 2442 case Intrinsic::x86_avx512_mask_sub_ss_round: 2443 case Intrinsic::x86_avx512_mask_add_sd_round: 2444 case Intrinsic::x86_avx512_mask_div_sd_round: 2445 case Intrinsic::x86_avx512_mask_mul_sd_round: 2446 case Intrinsic::x86_avx512_mask_sub_sd_round: 2447 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2448 // IR operations. 2449 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { 2450 if (R->getValue() == 4) { 2451 // Extract the element as scalars. 2452 Value *Arg0 = II.getArgOperand(0); 2453 Value *Arg1 = II.getArgOperand(1); 2454 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); 2455 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); 2456 2457 Value *V; 2458 switch (IID) { 2459 default: 2460 llvm_unreachable("Case stmts out of sync!"); 2461 case Intrinsic::x86_avx512_mask_add_ss_round: 2462 case Intrinsic::x86_avx512_mask_add_sd_round: 2463 V = IC.Builder.CreateFAdd(LHS, RHS); 2464 break; 2465 case Intrinsic::x86_avx512_mask_sub_ss_round: 2466 case Intrinsic::x86_avx512_mask_sub_sd_round: 2467 V = IC.Builder.CreateFSub(LHS, RHS); 2468 break; 2469 case Intrinsic::x86_avx512_mask_mul_ss_round: 2470 case Intrinsic::x86_avx512_mask_mul_sd_round: 2471 V = IC.Builder.CreateFMul(LHS, RHS); 2472 break; 2473 case Intrinsic::x86_avx512_mask_div_ss_round: 2474 case Intrinsic::x86_avx512_mask_div_sd_round: 2475 V = IC.Builder.CreateFDiv(LHS, RHS); 2476 break; 2477 } 2478 2479 // Handle the masking aspect of the intrinsic. 2480 Value *Mask = II.getArgOperand(3); 2481 auto *C = dyn_cast<ConstantInt>(Mask); 2482 // We don't need a select if we know the mask bit is a 1. 2483 if (!C || !C->getValue()[0]) { 2484 // Cast the mask to an i1 vector and then extract the lowest element. 2485 auto *MaskTy = FixedVectorType::get( 2486 IC.Builder.getInt1Ty(), 2487 cast<IntegerType>(Mask->getType())->getBitWidth()); 2488 Mask = IC.Builder.CreateBitCast(Mask, MaskTy); 2489 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); 2490 // Extract the lowest element from the passthru operand. 2491 Value *Passthru = 2492 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); 2493 V = IC.Builder.CreateSelect(Mask, V, Passthru); 2494 } 2495 2496 // Insert the result back into the original argument 0. 
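// e.g. mask.add.ss.round(A, B, P, K, 4) becomes
// insertelement(A, select(K[0], A[0]+B[0], P[0]), 0), and the select
// disappears when K's low bit is known to be set.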
2497 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2498 2499 return IC.replaceInstUsesWith(II, V); 2500 } 2501 } 2502 break; 2503 2504 // Constant fold ashr( <A x Bi>, Ci ). 2505 // Constant fold lshr( <A x Bi>, Ci ). 2506 // Constant fold shl( <A x Bi>, Ci ). 2507 case Intrinsic::x86_sse2_psrai_d: 2508 case Intrinsic::x86_sse2_psrai_w: 2509 case Intrinsic::x86_avx2_psrai_d: 2510 case Intrinsic::x86_avx2_psrai_w: 2511 case Intrinsic::x86_avx512_psrai_q_128: 2512 case Intrinsic::x86_avx512_psrai_q_256: 2513 case Intrinsic::x86_avx512_psrai_d_512: 2514 case Intrinsic::x86_avx512_psrai_q_512: 2515 case Intrinsic::x86_avx512_psrai_w_512: 2516 case Intrinsic::x86_sse2_psrli_d: 2517 case Intrinsic::x86_sse2_psrli_q: 2518 case Intrinsic::x86_sse2_psrli_w: 2519 case Intrinsic::x86_avx2_psrli_d: 2520 case Intrinsic::x86_avx2_psrli_q: 2521 case Intrinsic::x86_avx2_psrli_w: 2522 case Intrinsic::x86_avx512_psrli_d_512: 2523 case Intrinsic::x86_avx512_psrli_q_512: 2524 case Intrinsic::x86_avx512_psrli_w_512: 2525 case Intrinsic::x86_sse2_pslli_d: 2526 case Intrinsic::x86_sse2_pslli_q: 2527 case Intrinsic::x86_sse2_pslli_w: 2528 case Intrinsic::x86_avx2_pslli_d: 2529 case Intrinsic::x86_avx2_pslli_q: 2530 case Intrinsic::x86_avx2_pslli_w: 2531 case Intrinsic::x86_avx512_pslli_d_512: 2532 case Intrinsic::x86_avx512_pslli_q_512: 2533 case Intrinsic::x86_avx512_pslli_w_512: 2534 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2535 return IC.replaceInstUsesWith(II, V); 2536 } 2537 break; 2538 2539 case Intrinsic::x86_sse2_psra_d: 2540 case Intrinsic::x86_sse2_psra_w: 2541 case Intrinsic::x86_avx2_psra_d: 2542 case Intrinsic::x86_avx2_psra_w: 2543 case Intrinsic::x86_avx512_psra_q_128: 2544 case Intrinsic::x86_avx512_psra_q_256: 2545 case Intrinsic::x86_avx512_psra_d_512: 2546 case Intrinsic::x86_avx512_psra_q_512: 2547 case Intrinsic::x86_avx512_psra_w_512: 2548 case Intrinsic::x86_sse2_psrl_d: 2549 case Intrinsic::x86_sse2_psrl_q: 2550 case Intrinsic::x86_sse2_psrl_w: 2551 case Intrinsic::x86_avx2_psrl_d: 2552 case Intrinsic::x86_avx2_psrl_q: 2553 case Intrinsic::x86_avx2_psrl_w: 2554 case Intrinsic::x86_avx512_psrl_d_512: 2555 case Intrinsic::x86_avx512_psrl_q_512: 2556 case Intrinsic::x86_avx512_psrl_w_512: 2557 case Intrinsic::x86_sse2_psll_d: 2558 case Intrinsic::x86_sse2_psll_q: 2559 case Intrinsic::x86_sse2_psll_w: 2560 case Intrinsic::x86_avx2_psll_d: 2561 case Intrinsic::x86_avx2_psll_q: 2562 case Intrinsic::x86_avx2_psll_w: 2563 case Intrinsic::x86_avx512_psll_d_512: 2564 case Intrinsic::x86_avx512_psll_q_512: 2565 case Intrinsic::x86_avx512_psll_w_512: { 2566 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2567 return IC.replaceInstUsesWith(II, V); 2568 } 2569 2570 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2571 // operand to compute the shift amount. 
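// e.g. psrl.d interprets the low 64 bits of operand 1 as a single scalar
// shift count, so the upper half of that operand is never read.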
2572 Value *Arg1 = II.getArgOperand(1); 2573 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2574 "Unexpected packed shift size"); 2575 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); 2576 2577 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2578 return IC.replaceOperand(II, 1, V); 2579 } 2580 break; 2581 } 2582 2583 case Intrinsic::x86_avx2_psllv_d: 2584 case Intrinsic::x86_avx2_psllv_d_256: 2585 case Intrinsic::x86_avx2_psllv_q: 2586 case Intrinsic::x86_avx2_psllv_q_256: 2587 case Intrinsic::x86_avx512_psllv_d_512: 2588 case Intrinsic::x86_avx512_psllv_q_512: 2589 case Intrinsic::x86_avx512_psllv_w_128: 2590 case Intrinsic::x86_avx512_psllv_w_256: 2591 case Intrinsic::x86_avx512_psllv_w_512: 2592 case Intrinsic::x86_avx2_psrav_d: 2593 case Intrinsic::x86_avx2_psrav_d_256: 2594 case Intrinsic::x86_avx512_psrav_q_128: 2595 case Intrinsic::x86_avx512_psrav_q_256: 2596 case Intrinsic::x86_avx512_psrav_d_512: 2597 case Intrinsic::x86_avx512_psrav_q_512: 2598 case Intrinsic::x86_avx512_psrav_w_128: 2599 case Intrinsic::x86_avx512_psrav_w_256: 2600 case Intrinsic::x86_avx512_psrav_w_512: 2601 case Intrinsic::x86_avx2_psrlv_d: 2602 case Intrinsic::x86_avx2_psrlv_d_256: 2603 case Intrinsic::x86_avx2_psrlv_q: 2604 case Intrinsic::x86_avx2_psrlv_q_256: 2605 case Intrinsic::x86_avx512_psrlv_d_512: 2606 case Intrinsic::x86_avx512_psrlv_q_512: 2607 case Intrinsic::x86_avx512_psrlv_w_128: 2608 case Intrinsic::x86_avx512_psrlv_w_256: 2609 case Intrinsic::x86_avx512_psrlv_w_512: 2610 if (Value *V = simplifyX86varShift(II, IC.Builder)) { 2611 return IC.replaceInstUsesWith(II, V); 2612 } 2613 break; 2614 2615 case Intrinsic::x86_sse2_packssdw_128: 2616 case Intrinsic::x86_sse2_packsswb_128: 2617 case Intrinsic::x86_avx2_packssdw: 2618 case Intrinsic::x86_avx2_packsswb: 2619 case Intrinsic::x86_avx512_packssdw_512: 2620 case Intrinsic::x86_avx512_packsswb_512: 2621 if (Value *V = simplifyX86pack(II, IC.Builder, true)) { 2622 return IC.replaceInstUsesWith(II, V); 2623 } 2624 break; 2625 2626 case Intrinsic::x86_sse2_packuswb_128: 2627 case Intrinsic::x86_sse41_packusdw: 2628 case Intrinsic::x86_avx2_packusdw: 2629 case Intrinsic::x86_avx2_packuswb: 2630 case Intrinsic::x86_avx512_packusdw_512: 2631 case Intrinsic::x86_avx512_packuswb_512: 2632 if (Value *V = simplifyX86pack(II, IC.Builder, false)) { 2633 return IC.replaceInstUsesWith(II, V); 2634 } 2635 break; 2636 2637 case Intrinsic::x86_sse2_pmulh_w: 2638 case Intrinsic::x86_avx2_pmulh_w: 2639 case Intrinsic::x86_avx512_pmulh_w_512: 2640 if (Value *V = simplifyX86pmulh(II, IC.Builder, true, false)) { 2641 return IC.replaceInstUsesWith(II, V); 2642 } 2643 break; 2644 2645 case Intrinsic::x86_sse2_pmulhu_w: 2646 case Intrinsic::x86_avx2_pmulhu_w: 2647 case Intrinsic::x86_avx512_pmulhu_w_512: 2648 if (Value *V = simplifyX86pmulh(II, IC.Builder, false, false)) { 2649 return IC.replaceInstUsesWith(II, V); 2650 } 2651 break; 2652 2653 case Intrinsic::x86_ssse3_pmul_hr_sw_128: 2654 case Intrinsic::x86_avx2_pmul_hr_sw: 2655 case Intrinsic::x86_avx512_pmul_hr_sw_512: 2656 if (Value *V = simplifyX86pmulh(II, IC.Builder, true, true)) { 2657 return IC.replaceInstUsesWith(II, V); 2658 } 2659 break; 2660 2661 case Intrinsic::x86_sse2_pmadd_wd: 2662 case Intrinsic::x86_avx2_pmadd_wd: 2663 case Intrinsic::x86_avx512_pmaddw_d_512: 2664 if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) { 2665 return IC.replaceInstUsesWith(II, V); 2666 } 2667 break; 2668 2669 case Intrinsic::x86_ssse3_pmadd_ub_sw_128: 
2670 case Intrinsic::x86_avx2_pmadd_ub_sw: 2671 case Intrinsic::x86_avx512_pmaddubs_w_512: 2672 if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) { 2673 return IC.replaceInstUsesWith(II, V); 2674 } 2675 break; 2676 2677 case Intrinsic::x86_pclmulqdq: 2678 case Intrinsic::x86_pclmulqdq_256: 2679 case Intrinsic::x86_pclmulqdq_512: { 2680 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2681 unsigned Imm = C->getZExtValue(); 2682 2683 bool MadeChange = false; 2684 Value *Arg0 = II.getArgOperand(0); 2685 Value *Arg1 = II.getArgOperand(1); 2686 unsigned VWidth = 2687 cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2688 2689 APInt UndefElts1(VWidth, 0); 2690 APInt DemandedElts1 = 2691 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1)); 2692 if (Value *V = 2693 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { 2694 IC.replaceOperand(II, 0, V); 2695 MadeChange = true; 2696 } 2697 2698 APInt UndefElts2(VWidth, 0); 2699 APInt DemandedElts2 = 2700 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); 2701 if (Value *V = 2702 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { 2703 IC.replaceOperand(II, 1, V); 2704 MadeChange = true; 2705 } 2706 2707 // If either input elements are undef, the result is zero. 2708 if (DemandedElts1.isSubsetOf(UndefElts1) || 2709 DemandedElts2.isSubsetOf(UndefElts2)) { 2710 return IC.replaceInstUsesWith(II, 2711 ConstantAggregateZero::get(II.getType())); 2712 } 2713 2714 if (MadeChange) { 2715 return &II; 2716 } 2717 } 2718 break; 2719 } 2720 2721 case Intrinsic::x86_sse41_insertps: 2722 if (Value *V = simplifyX86insertps(II, IC.Builder)) { 2723 return IC.replaceInstUsesWith(II, V); 2724 } 2725 break; 2726 2727 case Intrinsic::x86_sse4a_extrq: { 2728 Value *Op0 = II.getArgOperand(0); 2729 Value *Op1 = II.getArgOperand(1); 2730 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2731 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2732 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2733 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2734 VWidth1 == 16 && "Unexpected operand sizes"); 2735 2736 // See if we're dealing with constant values. 2737 auto *C1 = dyn_cast<Constant>(Op1); 2738 auto *CILength = 2739 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 2740 : nullptr; 2741 auto *CIIndex = 2742 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2743 : nullptr; 2744 2745 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 2746 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2747 return IC.replaceInstUsesWith(II, V); 2748 } 2749 2750 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 2751 // operands and the lowest 16-bits of the second. 2752 bool MadeChange = false; 2753 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2754 IC.replaceOperand(II, 0, V); 2755 MadeChange = true; 2756 } 2757 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 2758 IC.replaceOperand(II, 1, V); 2759 MadeChange = true; 2760 } 2761 if (MadeChange) { 2762 return &II; 2763 } 2764 break; 2765 } 2766 2767 case Intrinsic::x86_sse4a_extrqi: { 2768 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 2769 // bits of the lower 64-bits. The upper 64-bits are undefined. 
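// e.g. extrqi(X, 8, 16) produces (X[0] >> 16) & 0xff in the low 64 bits of
// the result.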
2770 Value *Op0 = II.getArgOperand(0); 2771 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2772 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2773 "Unexpected operand size"); 2774 2775 // See if we're dealing with constant values. 2776 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); 2777 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2778 2779 // Attempt to simplify to a constant or shuffle vector. 2780 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2781 return IC.replaceInstUsesWith(II, V); 2782 } 2783 2784 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 2785 // operand. 2786 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2787 return IC.replaceOperand(II, 0, V); 2788 } 2789 break; 2790 } 2791 2792 case Intrinsic::x86_sse4a_insertq: { 2793 Value *Op0 = II.getArgOperand(0); 2794 Value *Op1 = II.getArgOperand(1); 2795 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2796 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2797 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2798 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && 2799 "Unexpected operand size"); 2800 2801 // See if we're dealing with constant values. 2802 auto *C1 = dyn_cast<Constant>(Op1); 2803 auto *CI11 = 2804 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2805 : nullptr; 2806 2807 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 2808 if (CI11) { 2809 const APInt &V11 = CI11->getValue(); 2810 APInt Len = V11.zextOrTrunc(6); 2811 APInt Idx = V11.lshr(8).zextOrTrunc(6); 2812 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2813 return IC.replaceInstUsesWith(II, V); 2814 } 2815 } 2816 2817 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 2818 // operand. 2819 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2820 return IC.replaceOperand(II, 0, V); 2821 } 2822 break; 2823 } 2824 2825 case Intrinsic::x86_sse4a_insertqi: { 2826 // INSERTQI: Extract lowest Length bits from lower half of second source and 2827 // insert over first source starting at Index bit. The upper 64-bits are 2828 // undefined. 2829 Value *Op0 = II.getArgOperand(0); 2830 Value *Op1 = II.getArgOperand(1); 2831 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2832 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2833 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2834 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2835 VWidth1 == 2 && "Unexpected operand sizes"); 2836 2837 // See if we're dealing with constant values. 2838 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2839 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); 2840 2841 // Attempt to simplify to a constant or shuffle vector. 2842 if (CILength && CIIndex) { 2843 APInt Len = CILength->getValue().zextOrTrunc(6); 2844 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 2845 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2846 return IC.replaceInstUsesWith(II, V); 2847 } 2848 } 2849 2850 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 2851 // operands. 
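// e.g. insertqi(X, Y, 8, 16) reads only X[0] and Y[0], so element 1 of each
// operand is not demanded.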
2852 bool MadeChange = false;
2853 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
2854 IC.replaceOperand(II, 0, V);
2855 MadeChange = true;
2856 }
2857 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
2858 IC.replaceOperand(II, 1, V);
2859 MadeChange = true;
2860 }
2861 if (MadeChange) {
2862 return &II;
2863 }
2864 break;
2865 }
2866
2867 case Intrinsic::x86_sse41_pblendvb:
2868 case Intrinsic::x86_sse41_blendvps:
2869 case Intrinsic::x86_sse41_blendvpd:
2870 case Intrinsic::x86_avx_blendv_ps_256:
2871 case Intrinsic::x86_avx_blendv_pd_256:
2872 case Intrinsic::x86_avx2_pblendvb: {
2873 // fold (blend A, A, Mask) -> A
2874 Value *Op0 = II.getArgOperand(0);
2875 Value *Op1 = II.getArgOperand(1);
2876 Value *Mask = II.getArgOperand(2);
2877 if (Op0 == Op1) {
2878 return IC.replaceInstUsesWith(II, Op0);
2879 }
2880
2881 // Zero Mask - select 1st argument.
2882 if (isa<ConstantAggregateZero>(Mask)) {
2883 return IC.replaceInstUsesWith(II, Op0);
2884 }
2885
2886 // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2887 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2888 Constant *NewSelector =
2889 getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2890 return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2891 }
2892
2893 Mask = InstCombiner::peekThroughBitcast(Mask);
2894
2895 // Peek through a one-use shuffle - VectorCombine should have simplified
2896 // this for cases where we're splitting wider vectors to use blendv
2897 // intrinsics.
2898 Value *MaskSrc = nullptr;
2899 ArrayRef<int> ShuffleMask;
2900 if (match(Mask, m_OneUse(m_Shuffle(m_Value(MaskSrc), m_Undef(),
2901 m_Mask(ShuffleMask))))) {
2902 // Bail if the shuffle is irregular or contains undefs.
2903 int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2904 if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
2905 any_of(ShuffleMask,
2906 [NumElts](int M) { return M < 0 || M >= NumElts; }))
2907 break;
2908 Mask = InstCombiner::peekThroughBitcast(MaskSrc);
2909 }
2910
2911 // Convert to a vector select if we can bypass casts and find a boolean
2912 // vector condition value.
2913 Value *BoolVec;
2914 if (match(Mask, m_SExt(m_Value(BoolVec))) &&
2915 BoolVec->getType()->isVectorTy() &&
2916 BoolVec->getType()->getScalarSizeInBits() == 1) {
2917 auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2918 auto *OpTy = cast<FixedVectorType>(II.getType());
2919 unsigned NumMaskElts = MaskTy->getNumElements();
2920 unsigned NumOperandElts = OpTy->getNumElements();
2921
2922 // If we peeked through a shuffle, reapply the shuffle to the bool vector.
2923 if (MaskSrc) {
2924 unsigned NumMaskSrcElts =
2925 cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2926 NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
2927 // Multiple mask bits map to the same operand element - bail out.
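// e.g. an <8 x i1> mask driving a 4-element blend would need two mask bits
// per element, which a single select cannot express.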
2928 if (NumMaskElts > NumOperandElts) 2929 break; 2930 SmallVector<int> ScaledMask; 2931 if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask)) 2932 break; 2933 BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask); 2934 MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts); 2935 } 2936 assert(MaskTy->getPrimitiveSizeInBits() == 2937 OpTy->getPrimitiveSizeInBits() && 2938 "Not expecting mask and operands with different sizes"); 2939 2940 if (NumMaskElts == NumOperandElts) { 2941 return SelectInst::Create(BoolVec, Op1, Op0); 2942 } 2943 2944 // If the mask has less elements than the operands, each mask bit maps to 2945 // multiple elements of the operands. Bitcast back and forth. 2946 if (NumMaskElts < NumOperandElts) { 2947 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy); 2948 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy); 2949 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 2950 return new BitCastInst(Sel, II.getType()); 2951 } 2952 } 2953 2954 break; 2955 } 2956 2957 case Intrinsic::x86_ssse3_pshuf_b_128: 2958 case Intrinsic::x86_avx2_pshuf_b: 2959 case Intrinsic::x86_avx512_pshuf_b_512: { 2960 if (Value *V = simplifyX86pshufb(II, IC.Builder)) { 2961 return IC.replaceInstUsesWith(II, V); 2962 } 2963 2964 KnownBits KnownMask(8); 2965 if (IC.SimplifyDemandedBits(&II, 1, APInt(8, 0b10001111), KnownMask)) 2966 return &II; 2967 break; 2968 } 2969 2970 case Intrinsic::x86_avx_vpermilvar_ps: 2971 case Intrinsic::x86_avx_vpermilvar_ps_256: 2972 case Intrinsic::x86_avx512_vpermilvar_ps_512: { 2973 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { 2974 return IC.replaceInstUsesWith(II, V); 2975 } 2976 2977 KnownBits KnownMask(32); 2978 if (IC.SimplifyDemandedBits(&II, 1, APInt(32, 0b00011), KnownMask)) 2979 return &II; 2980 break; 2981 } 2982 2983 case Intrinsic::x86_avx_vpermilvar_pd: 2984 case Intrinsic::x86_avx_vpermilvar_pd_256: 2985 case Intrinsic::x86_avx512_vpermilvar_pd_512: { 2986 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { 2987 return IC.replaceInstUsesWith(II, V); 2988 } 2989 2990 KnownBits KnownMask(64); 2991 if (IC.SimplifyDemandedBits(&II, 1, APInt(64, 0b00010), KnownMask)) 2992 return &II; 2993 break; 2994 } 2995 2996 case Intrinsic::x86_avx2_permd: 2997 case Intrinsic::x86_avx2_permps: 2998 case Intrinsic::x86_avx512_permvar_df_256: 2999 case Intrinsic::x86_avx512_permvar_df_512: 3000 case Intrinsic::x86_avx512_permvar_di_256: 3001 case Intrinsic::x86_avx512_permvar_di_512: 3002 case Intrinsic::x86_avx512_permvar_hi_128: 3003 case Intrinsic::x86_avx512_permvar_hi_256: 3004 case Intrinsic::x86_avx512_permvar_hi_512: 3005 case Intrinsic::x86_avx512_permvar_qi_128: 3006 case Intrinsic::x86_avx512_permvar_qi_256: 3007 case Intrinsic::x86_avx512_permvar_qi_512: 3008 case Intrinsic::x86_avx512_permvar_sf_512: 3009 case Intrinsic::x86_avx512_permvar_si_512: 3010 if (Value *V = simplifyX86vpermv(II, IC.Builder)) { 3011 return IC.replaceInstUsesWith(II, V); 3012 } 3013 if (simplifyX86VPERMMask(&II, /*IsBinary=*/false, IC)) 3014 return &II; 3015 break; 3016 3017 case Intrinsic::x86_avx512_vpermi2var_d_128: 3018 case Intrinsic::x86_avx512_vpermi2var_d_256: 3019 case Intrinsic::x86_avx512_vpermi2var_d_512: 3020 case Intrinsic::x86_avx512_vpermi2var_hi_128: 3021 case Intrinsic::x86_avx512_vpermi2var_hi_256: 3022 case Intrinsic::x86_avx512_vpermi2var_hi_512: 3023 case Intrinsic::x86_avx512_vpermi2var_pd_128: 3024 case Intrinsic::x86_avx512_vpermi2var_pd_256: 3025 case 
Intrinsic::x86_avx512_vpermi2var_pd_512: 3026 case Intrinsic::x86_avx512_vpermi2var_ps_128: 3027 case Intrinsic::x86_avx512_vpermi2var_ps_256: 3028 case Intrinsic::x86_avx512_vpermi2var_ps_512: 3029 case Intrinsic::x86_avx512_vpermi2var_q_128: 3030 case Intrinsic::x86_avx512_vpermi2var_q_256: 3031 case Intrinsic::x86_avx512_vpermi2var_q_512: 3032 case Intrinsic::x86_avx512_vpermi2var_qi_128: 3033 case Intrinsic::x86_avx512_vpermi2var_qi_256: 3034 case Intrinsic::x86_avx512_vpermi2var_qi_512: 3035 if (Value *V = simplifyX86vpermv3(II, IC.Builder)) { 3036 return IC.replaceInstUsesWith(II, V); 3037 } 3038 if (simplifyX86VPERMMask(&II, /*IsBinary=*/true, IC)) 3039 return &II; 3040 break; 3041 3042 case Intrinsic::x86_avx_maskload_ps: 3043 case Intrinsic::x86_avx_maskload_pd: 3044 case Intrinsic::x86_avx_maskload_ps_256: 3045 case Intrinsic::x86_avx_maskload_pd_256: 3046 case Intrinsic::x86_avx2_maskload_d: 3047 case Intrinsic::x86_avx2_maskload_q: 3048 case Intrinsic::x86_avx2_maskload_d_256: 3049 case Intrinsic::x86_avx2_maskload_q_256: 3050 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { 3051 return I; 3052 } 3053 break; 3054 3055 case Intrinsic::x86_sse2_maskmov_dqu: 3056 case Intrinsic::x86_avx_maskstore_ps: 3057 case Intrinsic::x86_avx_maskstore_pd: 3058 case Intrinsic::x86_avx_maskstore_ps_256: 3059 case Intrinsic::x86_avx_maskstore_pd_256: 3060 case Intrinsic::x86_avx2_maskstore_d: 3061 case Intrinsic::x86_avx2_maskstore_q: 3062 case Intrinsic::x86_avx2_maskstore_d_256: 3063 case Intrinsic::x86_avx2_maskstore_q_256: 3064 if (simplifyX86MaskedStore(II, IC)) { 3065 return nullptr; 3066 } 3067 break; 3068 3069 case Intrinsic::x86_addcarry_32: 3070 case Intrinsic::x86_addcarry_64: 3071 if (Value *V = simplifyX86addcarry(II, IC.Builder)) { 3072 return IC.replaceInstUsesWith(II, V); 3073 } 3074 break; 3075 3076 case Intrinsic::x86_avx512_pternlog_d_128: 3077 case Intrinsic::x86_avx512_pternlog_d_256: 3078 case Intrinsic::x86_avx512_pternlog_d_512: 3079 case Intrinsic::x86_avx512_pternlog_q_128: 3080 case Intrinsic::x86_avx512_pternlog_q_256: 3081 case Intrinsic::x86_avx512_pternlog_q_512: 3082 if (Value *V = simplifyTernarylogic(II, IC.Builder)) { 3083 return IC.replaceInstUsesWith(II, V); 3084 } 3085 break; 3086 default: 3087 break; 3088 } 3089 return std::nullopt; 3090 } 3091 3092 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic( 3093 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, 3094 bool &KnownBitsComputed) const { 3095 switch (II.getIntrinsicID()) { 3096 default: 3097 break; 3098 case Intrinsic::x86_mmx_pmovmskb: 3099 case Intrinsic::x86_sse_movmsk_ps: 3100 case Intrinsic::x86_sse2_movmsk_pd: 3101 case Intrinsic::x86_sse2_pmovmskb_128: 3102 case Intrinsic::x86_avx_movmsk_ps_256: 3103 case Intrinsic::x86_avx_movmsk_pd_256: 3104 case Intrinsic::x86_avx2_pmovmskb: { 3105 // MOVMSK copies the vector elements' sign bits to the low bits 3106 // and zeros the high bits. 3107 unsigned ArgWidth; 3108 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { 3109 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. 3110 } else { 3111 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType()); 3112 ArgWidth = ArgType->getNumElements(); 3113 } 3114 3115 // If we don't need any of low bits then return zero, 3116 // we know that DemandedMask is non-zero already. 
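// e.g. movmskps on <4 x float> can only set result bits 0-3, so a use that
// demands none of those bits always sees zero.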
3117 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
3118 Type *VTy = II.getType();
3119 if (DemandedElts.isZero()) {
3120 return ConstantInt::getNullValue(VTy);
3121 }
3122
3123 // We know that the upper bits are set to zero.
3124 Known.Zero.setBitsFrom(ArgWidth);
3125 KnownBitsComputed = true;
3126 break;
3127 }
3128 }
3129 return std::nullopt;
3130 }
3131
3132 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
3133 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
3134 APInt &UndefElts2, APInt &UndefElts3,
3135 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3136 simplifyAndSetOp) const {
3137 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
3138 switch (II.getIntrinsicID()) {
3139 default:
3140 break;
3141 case Intrinsic::x86_xop_vfrcz_ss:
3142 case Intrinsic::x86_xop_vfrcz_sd:
3143 // These instructions are specified to zero the upper bits rather than
3144 // pass them through like other scalar intrinsics, so we shouldn't just
3145 // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
3146 // Instead we should return a zero vector.
3147 if (!DemandedElts[0]) {
3148 IC.addToWorklist(&II);
3149 return ConstantAggregateZero::get(II.getType());
3150 }
3151
3152 // Only the lower element is used.
3153 DemandedElts = 1;
3154 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3155
3156 // Only the lower element is undefined. The high elements are zero.
3157 UndefElts = UndefElts[0];
3158 break;
3159
3160 // Unary scalar-as-vector operations that work column-wise.
3161 case Intrinsic::x86_sse_rcp_ss:
3162 case Intrinsic::x86_sse_rsqrt_ss:
3163 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3164
3165 // If lowest element of a scalar op isn't used then use Arg0.
3166 if (!DemandedElts[0]) {
3167 IC.addToWorklist(&II);
3168 return II.getArgOperand(0);
3169 }
3170 // TODO: If only the low element is demanded, lower SQRT to FSQRT (with
3171 // rounding/exception checks).
3172 break;
3173
3174 // Binary scalar-as-vector operations that work column-wise. The high
3175 // elements come from operand 0. The low element is a function of both
3176 // operands.
3177 case Intrinsic::x86_sse_min_ss:
3178 case Intrinsic::x86_sse_max_ss:
3179 case Intrinsic::x86_sse_cmp_ss:
3180 case Intrinsic::x86_sse2_min_sd:
3181 case Intrinsic::x86_sse2_max_sd:
3182 case Intrinsic::x86_sse2_cmp_sd: {
3183 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
3184
3185 // If lowest element of a scalar op isn't used then use Arg0.
3186 if (!DemandedElts[0]) {
3187 IC.addToWorklist(&II);
3188 return II.getArgOperand(0);
3189 }
3190
3191 // Only lower element is used for operand 1.
3192 DemandedElts = 1;
3193 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
3194
3195 // Lower element is undefined if both lower elements are undefined.
3196 // Consider things like undef&0. The result is known zero, not undef.
3197 if (!UndefElts2[0])
3198 UndefElts.clearBit(0);
3199
3200 break;
3201 }
3202
3203 // Binary scalar-as-vector operations that work column-wise. The high
3204 // elements come from operand 0 and the low element comes from operand 1.
3205 case Intrinsic::x86_sse41_round_ss:
3206 case Intrinsic::x86_sse41_round_sd: {
3207 // Don't use the low element of operand 0.
3208 APInt DemandedElts2 = DemandedElts;
3209 DemandedElts2.clearBit(0);
3210 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
3211
3212 // If lowest element of a scalar op isn't used then use Arg0.
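// e.g. for roundss, elements 1-3 of the result come straight from operand 0,
// so when element 0 isn't demanded the whole call folds to operand 0.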
3213 if (!DemandedElts[0]) { 3214 IC.addToWorklist(&II); 3215 return II.getArgOperand(0); 3216 } 3217 3218 // Only lower element is used for operand 1. 3219 DemandedElts = 1; 3220 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3221 3222 // Take the high undef elements from operand 0 and take the lower element 3223 // from operand 1. 3224 UndefElts.clearBit(0); 3225 UndefElts |= UndefElts2[0]; 3226 break; 3227 } 3228 3229 // Three input scalar-as-vector operations that work column-wise. The high 3230 // elements come from operand 0 and the low element is a function of all 3231 // three inputs. 3232 case Intrinsic::x86_avx512_mask_add_ss_round: 3233 case Intrinsic::x86_avx512_mask_div_ss_round: 3234 case Intrinsic::x86_avx512_mask_mul_ss_round: 3235 case Intrinsic::x86_avx512_mask_sub_ss_round: 3236 case Intrinsic::x86_avx512_mask_max_ss_round: 3237 case Intrinsic::x86_avx512_mask_min_ss_round: 3238 case Intrinsic::x86_avx512_mask_add_sd_round: 3239 case Intrinsic::x86_avx512_mask_div_sd_round: 3240 case Intrinsic::x86_avx512_mask_mul_sd_round: 3241 case Intrinsic::x86_avx512_mask_sub_sd_round: 3242 case Intrinsic::x86_avx512_mask_max_sd_round: 3243 case Intrinsic::x86_avx512_mask_min_sd_round: 3244 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3245 3246 // If lowest element of a scalar op isn't used then use Arg0. 3247 if (!DemandedElts[0]) { 3248 IC.addToWorklist(&II); 3249 return II.getArgOperand(0); 3250 } 3251 3252 // Only lower element is used for operand 1 and 2. 3253 DemandedElts = 1; 3254 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3255 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); 3256 3257 // Lower element is undefined if all three lower elements are undefined. 3258 // Consider things like undef&0. The result is known zero, not undef. 3259 if (!UndefElts2[0] || !UndefElts3[0]) 3260 UndefElts.clearBit(0); 3261 break; 3262 3263 // TODO: Add fmaddsub support? 3264 case Intrinsic::x86_sse3_addsub_pd: 3265 case Intrinsic::x86_sse3_addsub_ps: 3266 case Intrinsic::x86_avx_addsub_pd_256: 3267 case Intrinsic::x86_avx_addsub_ps_256: { 3268 // If none of the even or none of the odd lanes are required, turn this 3269 // into a generic FP math instruction. 3270 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); 3271 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); 3272 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); 3273 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); 3274 if (IsSubOnly || IsAddOnly) { 3275 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); 3276 IRBuilderBase::InsertPointGuard Guard(IC.Builder); 3277 IC.Builder.SetInsertPoint(&II); 3278 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); 3279 return IC.Builder.CreateBinOp( 3280 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1); 3281 } 3282 3283 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3284 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3285 UndefElts &= UndefElts2; 3286 break; 3287 } 3288 3289 // General per-element vector operations. 
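// Each result element of these variable shifts depends only on the
// corresponding elements of the two operands, so demanded and undef elements
// propagate per element.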
3290 case Intrinsic::x86_avx2_psllv_d: 3291 case Intrinsic::x86_avx2_psllv_d_256: 3292 case Intrinsic::x86_avx2_psllv_q: 3293 case Intrinsic::x86_avx2_psllv_q_256: 3294 case Intrinsic::x86_avx2_psrlv_d: 3295 case Intrinsic::x86_avx2_psrlv_d_256: 3296 case Intrinsic::x86_avx2_psrlv_q: 3297 case Intrinsic::x86_avx2_psrlv_q_256: 3298 case Intrinsic::x86_avx2_psrav_d: 3299 case Intrinsic::x86_avx2_psrav_d_256: { 3300 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3301 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3302 UndefElts &= UndefElts2; 3303 break; 3304 } 3305 3306 case Intrinsic::x86_sse2_pmulh_w: 3307 case Intrinsic::x86_avx2_pmulh_w: 3308 case Intrinsic::x86_avx512_pmulh_w_512: 3309 case Intrinsic::x86_sse2_pmulhu_w: 3310 case Intrinsic::x86_avx2_pmulhu_w: 3311 case Intrinsic::x86_avx512_pmulhu_w_512: 3312 case Intrinsic::x86_ssse3_pmul_hr_sw_128: 3313 case Intrinsic::x86_avx2_pmul_hr_sw: 3314 case Intrinsic::x86_avx512_pmul_hr_sw_512: { 3315 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3316 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3317 // NOTE: mulh(undef,undef) != undef. 3318 break; 3319 } 3320 3321 case Intrinsic::x86_sse2_packssdw_128: 3322 case Intrinsic::x86_sse2_packsswb_128: 3323 case Intrinsic::x86_sse2_packuswb_128: 3324 case Intrinsic::x86_sse41_packusdw: 3325 case Intrinsic::x86_avx2_packssdw: 3326 case Intrinsic::x86_avx2_packsswb: 3327 case Intrinsic::x86_avx2_packusdw: 3328 case Intrinsic::x86_avx2_packuswb: 3329 case Intrinsic::x86_avx512_packssdw_512: 3330 case Intrinsic::x86_avx512_packsswb_512: 3331 case Intrinsic::x86_avx512_packusdw_512: 3332 case Intrinsic::x86_avx512_packuswb_512: { 3333 auto *Ty0 = II.getArgOperand(0)->getType(); 3334 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); 3335 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); 3336 3337 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; 3338 unsigned VWidthPerLane = VWidth / NumLanes; 3339 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; 3340 3341 // Per lane, pack the elements of the first input and then the second. 3342 // e.g. 3343 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) 3344 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) 3345 for (int OpNum = 0; OpNum != 2; ++OpNum) { 3346 APInt OpDemandedElts(InnerVWidth, 0); 3347 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3348 unsigned LaneIdx = Lane * VWidthPerLane; 3349 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { 3350 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; 3351 if (DemandedElts[Idx]) 3352 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); 3353 } 3354 } 3355 3356 // Demand elements from the operand. 3357 APInt OpUndefElts(InnerVWidth, 0); 3358 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); 3359 3360 // Pack the operand's UNDEF elements, one lane at a time. 
3361 OpUndefElts = OpUndefElts.zext(VWidth); 3362 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3363 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); 3364 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); 3365 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); 3366 UndefElts |= LaneElts; 3367 } 3368 } 3369 break; 3370 } 3371 3372 case Intrinsic::x86_sse2_pmadd_wd: 3373 case Intrinsic::x86_avx2_pmadd_wd: 3374 case Intrinsic::x86_avx512_pmaddw_d_512: 3375 case Intrinsic::x86_ssse3_pmadd_ub_sw_128: 3376 case Intrinsic::x86_avx2_pmadd_ub_sw: 3377 case Intrinsic::x86_avx512_pmaddubs_w_512: { 3378 // PMADD - demand both src elements that map to each dst element. 3379 auto *ArgTy = II.getArgOperand(0)->getType(); 3380 unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements(); 3381 assert((VWidth * 2) == InnerVWidth && "Unexpected input size"); 3382 APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth); 3383 APInt Op0UndefElts(InnerVWidth, 0); 3384 APInt Op1UndefElts(InnerVWidth, 0); 3385 simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts); 3386 simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts); 3387 // NOTE: madd(undef,undef) != undef. 3388 break; 3389 } 3390 3391 // PSHUFB 3392 case Intrinsic::x86_ssse3_pshuf_b_128: 3393 case Intrinsic::x86_avx2_pshuf_b: 3394 case Intrinsic::x86_avx512_pshuf_b_512: 3395 // PERMILVAR 3396 case Intrinsic::x86_avx_vpermilvar_ps: 3397 case Intrinsic::x86_avx_vpermilvar_ps_256: 3398 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3399 case Intrinsic::x86_avx_vpermilvar_pd: 3400 case Intrinsic::x86_avx_vpermilvar_pd_256: 3401 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3402 // PERMV 3403 case Intrinsic::x86_avx2_permd: 3404 case Intrinsic::x86_avx2_permps: { 3405 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); 3406 break; 3407 } 3408 3409 // SSE4A instructions leave the upper 64-bits of the 128-bit result 3410 // in an undefined state. 3411 case Intrinsic::x86_sse4a_extrq: 3412 case Intrinsic::x86_sse4a_extrqi: 3413 case Intrinsic::x86_sse4a_insertq: 3414 case Intrinsic::x86_sse4a_insertqi: 3415 UndefElts.setHighBits(VWidth / 2); 3416 break; 3417 } 3418 return std::nullopt; 3419 } 3420