//===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the X86-specific parts of the InstCombine pass. It
/// uses the target's detailed knowledge of its intrinsics to fold X86 vector
/// intrinsic calls into simpler generic IR where possible, while letting the
/// target-independent InstCombine logic handle everything else.
///
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86tti"

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
static Constant *getNegativeIsTrueBoolVec(Constant *V, const DataLayout &DL) {
  VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
  V = ConstantExpr::getBitCast(V, IntTy);
  V = ConstantFoldCompareInstOperands(CmpInst::ICMP_SGT,
                                      Constant::getNullValue(IntTy), V, DL);
  assert(V && "Vector must be foldable");
  return V;
}

/// Convert the x86 XMM integer vector mask to a vector of bools based on
/// each element's most significant bit (the sign bit).
static Value *getBoolVecFromMask(Value *Mask, const DataLayout &DL) {
  // Fold Constant Mask.
  if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
    return getNegativeIsTrueBoolVec(ConstantMask, DL);

  // Mask was extended from a boolean vector.
  Value *ExtMask;
  if (PatternMatch::match(
          Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
      ExtMask->getType()->isIntOrIntVectorTy(1))
    return ExtMask;

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Constant *ZeroVec = Constant::getNullValue(II.getType());

  // Zero Mask - masked load instruction creates a zero vector.
  if (isa<ConstantAggregateZero>(Mask))
    return IC.replaceInstUsesWith(II, ZeroVec);

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
    // First, cast the x86 intrinsic scalar pointer to a vector pointer to
    // match the LLVM intrinsic definition for the pointer argument.
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    // The pass-through vector for an x86 masked load is a zero vector.
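    // Illustrative IR (value names are hypothetical), assuming a v4f32 AVX
    // maskload whose mask was sign-extended from a bool vector %m:
    //   %r = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %p, <4 x i32> %sext)
    // becomes
    //   %r = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %p, i32 1,
    //                                                    <4 x i1> %m,
    //                                                    <4 x float> zeroinitializer)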
    CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
        II.getType(), PtrCast, Align(1), BoolMask, ZeroVec);
    return IC.replaceInstUsesWith(II, NewMaskedLoad);
  }

  return nullptr;
}

// TODO: If the x86 backend knew how to convert a bool vector mask back to an
// XMM register mask efficiently, we could transform all x86 masked intrinsics
// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
  Value *Ptr = II.getOperand(0);
  Value *Mask = II.getOperand(1);
  Value *Vec = II.getOperand(2);

  // Zero Mask - this masked store instruction does nothing.
  if (isa<ConstantAggregateZero>(Mask)) {
    IC.eraseInstFromFunction(II);
    return true;
  }

  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
  // anything else at this level.
  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    return false;

  // The mask is constant or extended from a bool vector. Convert this x86
  // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
  if (Value *BoolMask = getBoolVecFromMask(Mask, IC.getDataLayout())) {
    unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

    IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

    // 'Replace uses' doesn't work for stores. Erase the original masked store.
    IC.eraseInstFromFunction(II);
    return true;
  }

  return false;
}

static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;
  bool IsImm = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    IsImm = true;
    [[fallthrough]];
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(Vec->getType());
  Type *SVT = VT->getElementType();
  Type *AmtVT = Amt->getType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift. If it's guaranteed to be out of range, logical shifts
  // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
  if (IsImm) {
    assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    KnownBits KnownAmtBits = llvm::computeKnownBits(Amt, II.getDataLayout());
    if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
      Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
      Amt = Builder.CreateVectorSplat(VWidth, Amt);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
    if (KnownAmtBits.getMinValue().uge(BitWidth)) {
      if (LogicalShift)
        return ConstantAggregateZero::get(VT);
      Amt = ConstantInt::get(SVT, BitWidth - 1);
      return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    }
  } else {
    // Ensure the first element has an in-range value and the rest of the
    // elements in the bottom 64 bits are zero.
    assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
           cast<VectorType>(AmtVT)->getElementType() == SVT &&
           "Unexpected shift-by-scalar type");
    unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    KnownBits KnownLowerBits =
        llvm::computeKnownBits(Amt, DemandedLower, II.getDataLayout());
    KnownBits KnownUpperBits =
        llvm::computeKnownBits(Amt, DemandedUpper, II.getDataLayout());
    if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
        (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
      SmallVector<int, 16> ZeroSplat(VWidth, 0);
      Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
      return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                        : Builder.CreateLShr(Vec, Amt))
                           : Builder.CreateAShr(Vec, Amt));
    }
  }

  // Simplify if count is constant vector.
  auto *CDV = dyn_cast<ConstantDataVector>(Amt);
  if (!CDV)
    return nullptr;

  // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
  // operand to compute the shift amount.
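  // For example (illustrative constant), a psrl.w count operand of
  // <8 x i16> <i16 3, i16 0, i16 0, i16 0, ...> concatenates to the 64-bit
  // value 3, so every lane of the result is shifted right by 3.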
  assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
         cast<VectorType>(AmtVT)->getElementType() == SVT &&
         "Unexpected shift-by-scalar type");

  // Concatenate the sub-elements to create the 64-bit value.
  APInt Count(64, 0);
  for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    unsigned SubEltIdx = (NumSubElts - 1) - i;
    auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    Count <<= BitWidth;
    Count |= SubElt->getValue().zextOrTrunc(64);
  }

  // If shift-by-zero then just return the original value.
  if (Count.isZero())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  Value *Vec = II.getArgOperand(0);
  Value *Amt = II.getArgOperand(1);
  auto *VT = cast<FixedVectorType>(II.getType());
  Type *SVT = VT->getElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // If the shift amount is guaranteed to be in-range we can replace it with a
  // generic shift.
  KnownBits KnownAmt = llvm::computeKnownBits(Amt, II.getDataLayout());
  if (KnownAmt.getMaxValue().ult(BitWidth)) {
    return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
                                      : Builder.CreateLShr(Vec, Amt))
                         : Builder.CreateAShr(Vec, Amt));
  }

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(Amt);
  if (!CShift)
    return nullptr;

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (isa_and_nonnull<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getNumElements();
  assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getZero(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<int, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

static Value *simplifyX86pmadd(IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder,
                               bool IsPMADDWD) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  auto *ResTy = cast<FixedVectorType>(II.getType());
  [[maybe_unused]] auto *ArgTy = cast<FixedVectorType>(Arg0->getType());

  unsigned NumDstElts = ResTy->getNumElements();
  assert(ArgTy->getNumElements() == (2 * NumDstElts) &&
         ResTy->getScalarSizeInBits() == (2 * ArgTy->getScalarSizeInBits()) &&
         "Unexpected PMADD types");

  // Multiply by undef -> zero (NOT undef!) as other arg could still be zero.
  if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Multiply by zero.
  if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1))
    return ConstantAggregateZero::get(ResTy);

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Split Lo/Hi elements pairs, extend and add together.
  // PMADDWD(X,Y) =
  //   add(mul(sext(lhs[0]),sext(rhs[0])),mul(sext(lhs[1]),sext(rhs[1])))
  // PMADDUBSW(X,Y) =
  //   sadd_sat(mul(zext(lhs[0]),sext(rhs[0])),mul(zext(lhs[1]),sext(rhs[1])))
  SmallVector<int> LoMask, HiMask;
  for (unsigned I = 0; I != NumDstElts; ++I) {
    LoMask.push_back(2 * I + 0);
    HiMask.push_back(2 * I + 1);
  }

  auto *LHSLo = Builder.CreateShuffleVector(Arg0, LoMask);
  auto *LHSHi = Builder.CreateShuffleVector(Arg0, HiMask);
  auto *RHSLo = Builder.CreateShuffleVector(Arg1, LoMask);
  auto *RHSHi = Builder.CreateShuffleVector(Arg1, HiMask);

  auto LHSCast =
      IsPMADDWD ? Instruction::CastOps::SExt : Instruction::CastOps::ZExt;
  LHSLo = Builder.CreateCast(LHSCast, LHSLo, ResTy);
  LHSHi = Builder.CreateCast(LHSCast, LHSHi, ResTy);
  RHSLo = Builder.CreateCast(Instruction::CastOps::SExt, RHSLo, ResTy);
  RHSHi = Builder.CreateCast(Instruction::CastOps::SExt, RHSHi, ResTy);
  Value *Lo = Builder.CreateMul(LHSLo, RHSLo);
  Value *Hi = Builder.CreateMul(LHSHi, RHSHi);
  return IsPMADDWD
             ? Builder.CreateAdd(Lo, Hi)
             : Builder.CreateIntrinsic(ResTy, Intrinsic::sadd_sat, {Lo, Hi});
}

static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
  // We can't easily peek through x86_mmx types.
  if (!ArgTy)
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  //   %cmp = icmp slt <16 x i8> %x, zeroinitializer
  //   %int = bitcast <16 x i1> %cmp to i16
  //   %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getNumElements();
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, VectorType::getInteger(ArgTy));
  Res = Builder.CreateIsNeg(Res);
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          {Op1, Op2});
    // The types have to be adjusted to match the x86 call types.
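    // The x86 intrinsic returns { i8 carry-out, iN sum } while
    // uadd.with.overflow returns { iN sum, i1 overflow }, so the overflow bit
    // is widened to i8 and the two struct fields are swapped below.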
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = PoisonValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

static Value *simplifyTernarylogic(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {

  auto *ArgImm = dyn_cast<ConstantInt>(II.getArgOperand(3));
  if (!ArgImm || ArgImm->getValue().uge(256))
    return nullptr;

  Value *ArgA = II.getArgOperand(0);
  Value *ArgB = II.getArgOperand(1);
  Value *ArgC = II.getArgOperand(2);

  Type *Ty = II.getType();

  auto Or = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateOr(Lhs.first, Rhs.first), Lhs.second | Rhs.second};
  };
  auto Xor = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateXor(Lhs.first, Rhs.first), Lhs.second ^ Rhs.second};
  };
  auto And = [&](auto Lhs, auto Rhs) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateAnd(Lhs.first, Rhs.first), Lhs.second & Rhs.second};
  };
  auto Not = [&](auto V) -> std::pair<Value *, uint8_t> {
    return {Builder.CreateNot(V.first), ~V.second};
  };
  auto Nor = [&](auto Lhs, auto Rhs) { return Not(Or(Lhs, Rhs)); };
  auto Xnor = [&](auto Lhs, auto Rhs) { return Not(Xor(Lhs, Rhs)); };
  auto Nand = [&](auto Lhs, auto Rhs) { return Not(And(Lhs, Rhs)); };

  bool AIsConst = match(ArgA, PatternMatch::m_ImmConstant());
  bool BIsConst = match(ArgB, PatternMatch::m_ImmConstant());
  bool CIsConst = match(ArgC, PatternMatch::m_ImmConstant());

  bool ABIsConst = AIsConst && BIsConst;
  bool ACIsConst = AIsConst && CIsConst;
  bool BCIsConst = BIsConst && CIsConst;
  bool ABCIsConst = AIsConst && BIsConst && CIsConst;

  // Use for verification. It's a big table. It's difficult to go from Imm ->
  // logic ops, but easy to verify that a set of logic ops is correct. We track
  // the logic ops through the second value in the pair. At the end it should
  // equal Imm.
  std::pair<Value *, uint8_t> A = {ArgA, 0xf0};
  std::pair<Value *, uint8_t> B = {ArgB, 0xcc};
  std::pair<Value *, uint8_t> C = {ArgC, 0xaa};
  std::pair<Value *, uint8_t> Res = {nullptr, 0};

  // Currently we only handle cases that convert directly to another
  // instruction or cases where all the ops are constant. This is because we
  // don't properly handle creating ternary ops in the backend, so splitting
  // them here may cause regressions. As the backend improves, uncomment more
  // cases.
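  // The 8-bit immediate is the truth table of the operation: bit i of Imm is
  // the result for the input combination (a, b, c) given by the three bits of
  // i, which is why A, B and C carry the identity tables 0xf0, 0xcc and 0xaa.
  // For example, Imm == 0x80 is A & B & C and Imm == 0xe8 is the majority
  // function (these particular values are just illustrations).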
666 667 uint8_t Imm = ArgImm->getValue().getZExtValue(); 668 switch (Imm) { 669 case 0x0: 670 Res = {Constant::getNullValue(Ty), 0}; 671 break; 672 case 0x1: 673 if (ABCIsConst) 674 Res = Nor(Or(A, B), C); 675 break; 676 case 0x2: 677 if (ABCIsConst) 678 Res = And(Nor(A, B), C); 679 break; 680 case 0x3: 681 if (ABIsConst) 682 Res = Nor(A, B); 683 break; 684 case 0x4: 685 if (ABCIsConst) 686 Res = And(Nor(A, C), B); 687 break; 688 case 0x5: 689 if (ACIsConst) 690 Res = Nor(A, C); 691 break; 692 case 0x6: 693 if (ABCIsConst) 694 Res = Nor(A, Xnor(B, C)); 695 break; 696 case 0x7: 697 if (ABCIsConst) 698 Res = Nor(A, And(B, C)); 699 break; 700 case 0x8: 701 if (ABCIsConst) 702 Res = Nor(A, Nand(B, C)); 703 break; 704 case 0x9: 705 if (ABCIsConst) 706 Res = Nor(A, Xor(B, C)); 707 break; 708 case 0xa: 709 if (ACIsConst) 710 Res = Nor(A, Not(C)); 711 break; 712 case 0xb: 713 if (ABCIsConst) 714 Res = Nor(A, Nor(C, Not(B))); 715 break; 716 case 0xc: 717 if (ABIsConst) 718 Res = Nor(A, Not(B)); 719 break; 720 case 0xd: 721 if (ABCIsConst) 722 Res = Nor(A, Nor(B, Not(C))); 723 break; 724 case 0xe: 725 if (ABCIsConst) 726 Res = Nor(A, Nor(B, C)); 727 break; 728 case 0xf: 729 Res = Not(A); 730 break; 731 case 0x10: 732 if (ABCIsConst) 733 Res = And(A, Nor(B, C)); 734 break; 735 case 0x11: 736 if (BCIsConst) 737 Res = Nor(B, C); 738 break; 739 case 0x12: 740 if (ABCIsConst) 741 Res = Nor(Xnor(A, C), B); 742 break; 743 case 0x13: 744 if (ABCIsConst) 745 Res = Nor(And(A, C), B); 746 break; 747 case 0x14: 748 if (ABCIsConst) 749 Res = Nor(Xnor(A, B), C); 750 break; 751 case 0x15: 752 if (ABCIsConst) 753 Res = Nor(And(A, B), C); 754 break; 755 case 0x16: 756 if (ABCIsConst) 757 Res = Xor(Xor(A, B), And(Nand(A, B), C)); 758 break; 759 case 0x17: 760 if (ABCIsConst) 761 Res = Xor(Or(A, B), Or(Xnor(A, B), C)); 762 break; 763 case 0x18: 764 if (ABCIsConst) 765 Res = Nor(Xnor(A, B), Xnor(A, C)); 766 break; 767 case 0x19: 768 if (ABCIsConst) 769 Res = And(Nand(A, B), Xnor(B, C)); 770 break; 771 case 0x1a: 772 if (ABCIsConst) 773 Res = Xor(A, Or(And(A, B), C)); 774 break; 775 case 0x1b: 776 if (ABCIsConst) 777 Res = Xor(A, Or(Xnor(A, B), C)); 778 break; 779 case 0x1c: 780 if (ABCIsConst) 781 Res = Xor(A, Or(And(A, C), B)); 782 break; 783 case 0x1d: 784 if (ABCIsConst) 785 Res = Xor(A, Or(Xnor(A, C), B)); 786 break; 787 case 0x1e: 788 if (ABCIsConst) 789 Res = Xor(A, Or(B, C)); 790 break; 791 case 0x1f: 792 if (ABCIsConst) 793 Res = Nand(A, Or(B, C)); 794 break; 795 case 0x20: 796 if (ABCIsConst) 797 Res = Nor(Nand(A, C), B); 798 break; 799 case 0x21: 800 if (ABCIsConst) 801 Res = Nor(Xor(A, C), B); 802 break; 803 case 0x22: 804 if (BCIsConst) 805 Res = Nor(B, Not(C)); 806 break; 807 case 0x23: 808 if (ABCIsConst) 809 Res = Nor(B, Nor(C, Not(A))); 810 break; 811 case 0x24: 812 if (ABCIsConst) 813 Res = Nor(Xnor(A, B), Xor(A, C)); 814 break; 815 case 0x25: 816 if (ABCIsConst) 817 Res = Xor(A, Nand(Nand(A, B), C)); 818 break; 819 case 0x26: 820 if (ABCIsConst) 821 Res = And(Nand(A, B), Xor(B, C)); 822 break; 823 case 0x27: 824 if (ABCIsConst) 825 Res = Xor(Or(Xnor(A, B), C), B); 826 break; 827 case 0x28: 828 if (ABCIsConst) 829 Res = And(Xor(A, B), C); 830 break; 831 case 0x29: 832 if (ABCIsConst) 833 Res = Xor(Xor(A, B), Nor(And(A, B), C)); 834 break; 835 case 0x2a: 836 if (ABCIsConst) 837 Res = And(Nand(A, B), C); 838 break; 839 case 0x2b: 840 if (ABCIsConst) 841 Res = Xor(Or(Xnor(A, B), Xor(A, C)), A); 842 break; 843 case 0x2c: 844 if (ABCIsConst) 845 Res = Nor(Xnor(A, B), Nor(B, C)); 846 break; 847 case 0x2d: 
848 if (ABCIsConst) 849 Res = Xor(A, Or(B, Not(C))); 850 break; 851 case 0x2e: 852 if (ABCIsConst) 853 Res = Xor(A, Or(Xor(A, C), B)); 854 break; 855 case 0x2f: 856 if (ABCIsConst) 857 Res = Nand(A, Or(B, Not(C))); 858 break; 859 case 0x30: 860 if (ABIsConst) 861 Res = Nor(B, Not(A)); 862 break; 863 case 0x31: 864 if (ABCIsConst) 865 Res = Nor(Nor(A, Not(C)), B); 866 break; 867 case 0x32: 868 if (ABCIsConst) 869 Res = Nor(Nor(A, C), B); 870 break; 871 case 0x33: 872 Res = Not(B); 873 break; 874 case 0x34: 875 if (ABCIsConst) 876 Res = And(Xor(A, B), Nand(B, C)); 877 break; 878 case 0x35: 879 if (ABCIsConst) 880 Res = Xor(B, Or(A, Xnor(B, C))); 881 break; 882 case 0x36: 883 if (ABCIsConst) 884 Res = Xor(Or(A, C), B); 885 break; 886 case 0x37: 887 if (ABCIsConst) 888 Res = Nand(Or(A, C), B); 889 break; 890 case 0x38: 891 if (ABCIsConst) 892 Res = Nor(Xnor(A, B), Nor(A, C)); 893 break; 894 case 0x39: 895 if (ABCIsConst) 896 Res = Xor(Or(A, Not(C)), B); 897 break; 898 case 0x3a: 899 if (ABCIsConst) 900 Res = Xor(B, Or(A, Xor(B, C))); 901 break; 902 case 0x3b: 903 if (ABCIsConst) 904 Res = Nand(Or(A, Not(C)), B); 905 break; 906 case 0x3c: 907 Res = Xor(A, B); 908 break; 909 case 0x3d: 910 if (ABCIsConst) 911 Res = Xor(A, Or(Nor(A, C), B)); 912 break; 913 case 0x3e: 914 if (ABCIsConst) 915 Res = Xor(A, Or(Nor(A, Not(C)), B)); 916 break; 917 case 0x3f: 918 if (ABIsConst) 919 Res = Nand(A, B); 920 break; 921 case 0x40: 922 if (ABCIsConst) 923 Res = Nor(Nand(A, B), C); 924 break; 925 case 0x41: 926 if (ABCIsConst) 927 Res = Nor(Xor(A, B), C); 928 break; 929 case 0x42: 930 if (ABCIsConst) 931 Res = Nor(Xor(A, B), Xnor(A, C)); 932 break; 933 case 0x43: 934 if (ABCIsConst) 935 Res = Xor(A, Nand(Nand(A, C), B)); 936 break; 937 case 0x44: 938 if (BCIsConst) 939 Res = Nor(C, Not(B)); 940 break; 941 case 0x45: 942 if (ABCIsConst) 943 Res = Nor(Nor(B, Not(A)), C); 944 break; 945 case 0x46: 946 if (ABCIsConst) 947 Res = Xor(Or(And(A, C), B), C); 948 break; 949 case 0x47: 950 if (ABCIsConst) 951 Res = Xor(Or(Xnor(A, C), B), C); 952 break; 953 case 0x48: 954 if (ABCIsConst) 955 Res = And(Xor(A, C), B); 956 break; 957 case 0x49: 958 if (ABCIsConst) 959 Res = Xor(Or(Xnor(A, B), And(A, C)), C); 960 break; 961 case 0x4a: 962 if (ABCIsConst) 963 Res = Nor(Xnor(A, C), Nor(B, C)); 964 break; 965 case 0x4b: 966 if (ABCIsConst) 967 Res = Xor(A, Or(C, Not(B))); 968 break; 969 case 0x4c: 970 if (ABCIsConst) 971 Res = And(Nand(A, C), B); 972 break; 973 case 0x4d: 974 if (ABCIsConst) 975 Res = Xor(Or(Xor(A, B), Xnor(A, C)), A); 976 break; 977 case 0x4e: 978 if (ABCIsConst) 979 Res = Xor(A, Or(Xor(A, B), C)); 980 break; 981 case 0x4f: 982 if (ABCIsConst) 983 Res = Nand(A, Nand(B, Not(C))); 984 break; 985 case 0x50: 986 if (ACIsConst) 987 Res = Nor(C, Not(A)); 988 break; 989 case 0x51: 990 if (ABCIsConst) 991 Res = Nor(Nor(A, Not(B)), C); 992 break; 993 case 0x52: 994 if (ABCIsConst) 995 Res = And(Xor(A, C), Nand(B, C)); 996 break; 997 case 0x53: 998 if (ABCIsConst) 999 Res = Xor(Or(Xnor(B, C), A), C); 1000 break; 1001 case 0x54: 1002 if (ABCIsConst) 1003 Res = Nor(Nor(A, B), C); 1004 break; 1005 case 0x55: 1006 Res = Not(C); 1007 break; 1008 case 0x56: 1009 if (ABCIsConst) 1010 Res = Xor(Or(A, B), C); 1011 break; 1012 case 0x57: 1013 if (ABCIsConst) 1014 Res = Nand(Or(A, B), C); 1015 break; 1016 case 0x58: 1017 if (ABCIsConst) 1018 Res = Nor(Nor(A, B), Xnor(A, C)); 1019 break; 1020 case 0x59: 1021 if (ABCIsConst) 1022 Res = Xor(Or(A, Not(B)), C); 1023 break; 1024 case 0x5a: 1025 Res = Xor(A, C); 1026 break; 1027 case 0x5b: 
1028 if (ABCIsConst) 1029 Res = Xor(A, Or(Nor(A, B), C)); 1030 break; 1031 case 0x5c: 1032 if (ABCIsConst) 1033 Res = Xor(Or(Xor(B, C), A), C); 1034 break; 1035 case 0x5d: 1036 if (ABCIsConst) 1037 Res = Nand(Or(A, Not(B)), C); 1038 break; 1039 case 0x5e: 1040 if (ABCIsConst) 1041 Res = Xor(A, Or(Nor(A, Not(B)), C)); 1042 break; 1043 case 0x5f: 1044 if (ACIsConst) 1045 Res = Nand(A, C); 1046 break; 1047 case 0x60: 1048 if (ABCIsConst) 1049 Res = And(A, Xor(B, C)); 1050 break; 1051 case 0x61: 1052 if (ABCIsConst) 1053 Res = Xor(Or(Xnor(A, B), And(B, C)), C); 1054 break; 1055 case 0x62: 1056 if (ABCIsConst) 1057 Res = Nor(Nor(A, C), Xnor(B, C)); 1058 break; 1059 case 0x63: 1060 if (ABCIsConst) 1061 Res = Xor(B, Or(C, Not(A))); 1062 break; 1063 case 0x64: 1064 if (ABCIsConst) 1065 Res = Nor(Nor(A, B), Xnor(B, C)); 1066 break; 1067 case 0x65: 1068 if (ABCIsConst) 1069 Res = Xor(Or(B, Not(A)), C); 1070 break; 1071 case 0x66: 1072 Res = Xor(B, C); 1073 break; 1074 case 0x67: 1075 if (ABCIsConst) 1076 Res = Or(Nor(A, B), Xor(B, C)); 1077 break; 1078 case 0x68: 1079 if (ABCIsConst) 1080 Res = Xor(Xor(A, B), Nor(Nor(A, B), C)); 1081 break; 1082 case 0x69: 1083 if (ABCIsConst) 1084 Res = Xor(Xnor(A, B), C); 1085 break; 1086 case 0x6a: 1087 if (ABCIsConst) 1088 Res = Xor(And(A, B), C); 1089 break; 1090 case 0x6b: 1091 if (ABCIsConst) 1092 Res = Or(Nor(A, B), Xor(Xnor(A, B), C)); 1093 break; 1094 case 0x6c: 1095 if (ABCIsConst) 1096 Res = Xor(And(A, C), B); 1097 break; 1098 case 0x6d: 1099 if (ABCIsConst) 1100 Res = Xor(Or(Xnor(A, B), Nor(A, C)), C); 1101 break; 1102 case 0x6e: 1103 if (ABCIsConst) 1104 Res = Or(Nor(A, Not(B)), Xor(B, C)); 1105 break; 1106 case 0x6f: 1107 if (ABCIsConst) 1108 Res = Nand(A, Xnor(B, C)); 1109 break; 1110 case 0x70: 1111 if (ABCIsConst) 1112 Res = And(A, Nand(B, C)); 1113 break; 1114 case 0x71: 1115 if (ABCIsConst) 1116 Res = Xor(Nor(Xor(A, B), Xor(A, C)), A); 1117 break; 1118 case 0x72: 1119 if (ABCIsConst) 1120 Res = Xor(Or(Xor(A, B), C), B); 1121 break; 1122 case 0x73: 1123 if (ABCIsConst) 1124 Res = Nand(Nand(A, Not(C)), B); 1125 break; 1126 case 0x74: 1127 if (ABCIsConst) 1128 Res = Xor(Or(Xor(A, C), B), C); 1129 break; 1130 case 0x75: 1131 if (ABCIsConst) 1132 Res = Nand(Nand(A, Not(B)), C); 1133 break; 1134 case 0x76: 1135 if (ABCIsConst) 1136 Res = Xor(B, Or(Nor(B, Not(A)), C)); 1137 break; 1138 case 0x77: 1139 if (BCIsConst) 1140 Res = Nand(B, C); 1141 break; 1142 case 0x78: 1143 if (ABCIsConst) 1144 Res = Xor(A, And(B, C)); 1145 break; 1146 case 0x79: 1147 if (ABCIsConst) 1148 Res = Xor(Or(Xnor(A, B), Nor(B, C)), C); 1149 break; 1150 case 0x7a: 1151 if (ABCIsConst) 1152 Res = Or(Xor(A, C), Nor(B, Not(A))); 1153 break; 1154 case 0x7b: 1155 if (ABCIsConst) 1156 Res = Nand(Xnor(A, C), B); 1157 break; 1158 case 0x7c: 1159 if (ABCIsConst) 1160 Res = Or(Xor(A, B), Nor(C, Not(A))); 1161 break; 1162 case 0x7d: 1163 if (ABCIsConst) 1164 Res = Nand(Xnor(A, B), C); 1165 break; 1166 case 0x7e: 1167 if (ABCIsConst) 1168 Res = Or(Xor(A, B), Xor(A, C)); 1169 break; 1170 case 0x7f: 1171 if (ABCIsConst) 1172 Res = Nand(And(A, B), C); 1173 break; 1174 case 0x80: 1175 if (ABCIsConst) 1176 Res = And(And(A, B), C); 1177 break; 1178 case 0x81: 1179 if (ABCIsConst) 1180 Res = Nor(Xor(A, B), Xor(A, C)); 1181 break; 1182 case 0x82: 1183 if (ABCIsConst) 1184 Res = And(Xnor(A, B), C); 1185 break; 1186 case 0x83: 1187 if (ABCIsConst) 1188 Res = Nor(Xor(A, B), Nor(C, Not(A))); 1189 break; 1190 case 0x84: 1191 if (ABCIsConst) 1192 Res = And(Xnor(A, C), B); 1193 break; 1194 case 0x85: 1195 
if (ABCIsConst) 1196 Res = Nor(Xor(A, C), Nor(B, Not(A))); 1197 break; 1198 case 0x86: 1199 if (ABCIsConst) 1200 Res = Xor(Nor(Xnor(A, B), Nor(B, C)), C); 1201 break; 1202 case 0x87: 1203 if (ABCIsConst) 1204 Res = Xor(A, Nand(B, C)); 1205 break; 1206 case 0x88: 1207 Res = And(B, C); 1208 break; 1209 case 0x89: 1210 if (ABCIsConst) 1211 Res = Xor(B, Nor(Nor(B, Not(A)), C)); 1212 break; 1213 case 0x8a: 1214 if (ABCIsConst) 1215 Res = And(Nand(A, Not(B)), C); 1216 break; 1217 case 0x8b: 1218 if (ABCIsConst) 1219 Res = Xor(Nor(Xor(A, C), B), C); 1220 break; 1221 case 0x8c: 1222 if (ABCIsConst) 1223 Res = And(Nand(A, Not(C)), B); 1224 break; 1225 case 0x8d: 1226 if (ABCIsConst) 1227 Res = Xor(Nor(Xor(A, B), C), B); 1228 break; 1229 case 0x8e: 1230 if (ABCIsConst) 1231 Res = Xor(Or(Xor(A, B), Xor(A, C)), A); 1232 break; 1233 case 0x8f: 1234 if (ABCIsConst) 1235 Res = Nand(A, Nand(B, C)); 1236 break; 1237 case 0x90: 1238 if (ABCIsConst) 1239 Res = And(A, Xnor(B, C)); 1240 break; 1241 case 0x91: 1242 if (ABCIsConst) 1243 Res = Nor(Nor(A, Not(B)), Xor(B, C)); 1244 break; 1245 case 0x92: 1246 if (ABCIsConst) 1247 Res = Xor(Nor(Xnor(A, B), Nor(A, C)), C); 1248 break; 1249 case 0x93: 1250 if (ABCIsConst) 1251 Res = Xor(Nand(A, C), B); 1252 break; 1253 case 0x94: 1254 if (ABCIsConst) 1255 Res = Nor(Nor(A, B), Xor(Xnor(A, B), C)); 1256 break; 1257 case 0x95: 1258 if (ABCIsConst) 1259 Res = Xor(Nand(A, B), C); 1260 break; 1261 case 0x96: 1262 if (ABCIsConst) 1263 Res = Xor(Xor(A, B), C); 1264 break; 1265 case 0x97: 1266 if (ABCIsConst) 1267 Res = Xor(Xor(A, B), Or(Nor(A, B), C)); 1268 break; 1269 case 0x98: 1270 if (ABCIsConst) 1271 Res = Nor(Nor(A, B), Xor(B, C)); 1272 break; 1273 case 0x99: 1274 if (BCIsConst) 1275 Res = Xnor(B, C); 1276 break; 1277 case 0x9a: 1278 if (ABCIsConst) 1279 Res = Xor(Nor(B, Not(A)), C); 1280 break; 1281 case 0x9b: 1282 if (ABCIsConst) 1283 Res = Or(Nor(A, B), Xnor(B, C)); 1284 break; 1285 case 0x9c: 1286 if (ABCIsConst) 1287 Res = Xor(B, Nor(C, Not(A))); 1288 break; 1289 case 0x9d: 1290 if (ABCIsConst) 1291 Res = Or(Nor(A, C), Xnor(B, C)); 1292 break; 1293 case 0x9e: 1294 if (ABCIsConst) 1295 Res = Xor(And(Xor(A, B), Nand(B, C)), C); 1296 break; 1297 case 0x9f: 1298 if (ABCIsConst) 1299 Res = Nand(A, Xor(B, C)); 1300 break; 1301 case 0xa0: 1302 Res = And(A, C); 1303 break; 1304 case 0xa1: 1305 if (ABCIsConst) 1306 Res = Xor(A, Nor(Nor(A, Not(B)), C)); 1307 break; 1308 case 0xa2: 1309 if (ABCIsConst) 1310 Res = And(Or(A, Not(B)), C); 1311 break; 1312 case 0xa3: 1313 if (ABCIsConst) 1314 Res = Xor(Nor(Xor(B, C), A), C); 1315 break; 1316 case 0xa4: 1317 if (ABCIsConst) 1318 Res = Xor(A, Nor(Nor(A, B), C)); 1319 break; 1320 case 0xa5: 1321 if (ACIsConst) 1322 Res = Xnor(A, C); 1323 break; 1324 case 0xa6: 1325 if (ABCIsConst) 1326 Res = Xor(Nor(A, Not(B)), C); 1327 break; 1328 case 0xa7: 1329 if (ABCIsConst) 1330 Res = Or(Nor(A, B), Xnor(A, C)); 1331 break; 1332 case 0xa8: 1333 if (ABCIsConst) 1334 Res = And(Or(A, B), C); 1335 break; 1336 case 0xa9: 1337 if (ABCIsConst) 1338 Res = Xor(Nor(A, B), C); 1339 break; 1340 case 0xaa: 1341 Res = C; 1342 break; 1343 case 0xab: 1344 if (ABCIsConst) 1345 Res = Or(Nor(A, B), C); 1346 break; 1347 case 0xac: 1348 if (ABCIsConst) 1349 Res = Xor(Nor(Xnor(B, C), A), C); 1350 break; 1351 case 0xad: 1352 if (ABCIsConst) 1353 Res = Or(Xnor(A, C), And(B, C)); 1354 break; 1355 case 0xae: 1356 if (ABCIsConst) 1357 Res = Or(Nor(A, Not(B)), C); 1358 break; 1359 case 0xaf: 1360 if (ACIsConst) 1361 Res = Or(C, Not(A)); 1362 break; 1363 case 0xb0: 1364 if 
(ABCIsConst) 1365 Res = And(A, Nand(B, Not(C))); 1366 break; 1367 case 0xb1: 1368 if (ABCIsConst) 1369 Res = Xor(A, Nor(Xor(A, B), C)); 1370 break; 1371 case 0xb2: 1372 if (ABCIsConst) 1373 Res = Xor(Nor(Xor(A, B), Xnor(A, C)), A); 1374 break; 1375 case 0xb3: 1376 if (ABCIsConst) 1377 Res = Nand(Nand(A, C), B); 1378 break; 1379 case 0xb4: 1380 if (ABCIsConst) 1381 Res = Xor(A, Nor(C, Not(B))); 1382 break; 1383 case 0xb5: 1384 if (ABCIsConst) 1385 Res = Or(Xnor(A, C), Nor(B, C)); 1386 break; 1387 case 0xb6: 1388 if (ABCIsConst) 1389 Res = Xor(And(Xor(A, B), Nand(A, C)), C); 1390 break; 1391 case 0xb7: 1392 if (ABCIsConst) 1393 Res = Nand(Xor(A, C), B); 1394 break; 1395 case 0xb8: 1396 if (ABCIsConst) 1397 Res = Xor(Nor(Xnor(A, C), B), C); 1398 break; 1399 case 0xb9: 1400 if (ABCIsConst) 1401 Res = Xor(Nor(And(A, C), B), C); 1402 break; 1403 case 0xba: 1404 if (ABCIsConst) 1405 Res = Or(Nor(B, Not(A)), C); 1406 break; 1407 case 0xbb: 1408 if (BCIsConst) 1409 Res = Or(C, Not(B)); 1410 break; 1411 case 0xbc: 1412 if (ABCIsConst) 1413 Res = Xor(A, And(Nand(A, C), B)); 1414 break; 1415 case 0xbd: 1416 if (ABCIsConst) 1417 Res = Or(Xor(A, B), Xnor(A, C)); 1418 break; 1419 case 0xbe: 1420 if (ABCIsConst) 1421 Res = Or(Xor(A, B), C); 1422 break; 1423 case 0xbf: 1424 if (ABCIsConst) 1425 Res = Or(Nand(A, B), C); 1426 break; 1427 case 0xc0: 1428 Res = And(A, B); 1429 break; 1430 case 0xc1: 1431 if (ABCIsConst) 1432 Res = Xor(A, Nor(Nor(A, Not(C)), B)); 1433 break; 1434 case 0xc2: 1435 if (ABCIsConst) 1436 Res = Xor(A, Nor(Nor(A, C), B)); 1437 break; 1438 case 0xc3: 1439 if (ABIsConst) 1440 Res = Xnor(A, B); 1441 break; 1442 case 0xc4: 1443 if (ABCIsConst) 1444 Res = And(Or(A, Not(C)), B); 1445 break; 1446 case 0xc5: 1447 if (ABCIsConst) 1448 Res = Xor(B, Nor(A, Xor(B, C))); 1449 break; 1450 case 0xc6: 1451 if (ABCIsConst) 1452 Res = Xor(Nor(A, Not(C)), B); 1453 break; 1454 case 0xc7: 1455 if (ABCIsConst) 1456 Res = Or(Xnor(A, B), Nor(A, C)); 1457 break; 1458 case 0xc8: 1459 if (ABCIsConst) 1460 Res = And(Or(A, C), B); 1461 break; 1462 case 0xc9: 1463 if (ABCIsConst) 1464 Res = Xor(Nor(A, C), B); 1465 break; 1466 case 0xca: 1467 if (ABCIsConst) 1468 Res = Xor(B, Nor(A, Xnor(B, C))); 1469 break; 1470 case 0xcb: 1471 if (ABCIsConst) 1472 Res = Or(Xnor(A, B), And(B, C)); 1473 break; 1474 case 0xcc: 1475 Res = B; 1476 break; 1477 case 0xcd: 1478 if (ABCIsConst) 1479 Res = Or(Nor(A, C), B); 1480 break; 1481 case 0xce: 1482 if (ABCIsConst) 1483 Res = Or(Nor(A, Not(C)), B); 1484 break; 1485 case 0xcf: 1486 if (ABIsConst) 1487 Res = Or(B, Not(A)); 1488 break; 1489 case 0xd0: 1490 if (ABCIsConst) 1491 Res = And(A, Or(B, Not(C))); 1492 break; 1493 case 0xd1: 1494 if (ABCIsConst) 1495 Res = Xor(A, Nor(Xor(A, C), B)); 1496 break; 1497 case 0xd2: 1498 if (ABCIsConst) 1499 Res = Xor(A, Nor(B, Not(C))); 1500 break; 1501 case 0xd3: 1502 if (ABCIsConst) 1503 Res = Or(Xnor(A, B), Nor(B, C)); 1504 break; 1505 case 0xd4: 1506 if (ABCIsConst) 1507 Res = Xor(Nor(Xnor(A, B), Xor(A, C)), A); 1508 break; 1509 case 0xd5: 1510 if (ABCIsConst) 1511 Res = Nand(Nand(A, B), C); 1512 break; 1513 case 0xd6: 1514 if (ABCIsConst) 1515 Res = Xor(Xor(A, B), Or(And(A, B), C)); 1516 break; 1517 case 0xd7: 1518 if (ABCIsConst) 1519 Res = Nand(Xor(A, B), C); 1520 break; 1521 case 0xd8: 1522 if (ABCIsConst) 1523 Res = Xor(Nor(Xnor(A, B), C), B); 1524 break; 1525 case 0xd9: 1526 if (ABCIsConst) 1527 Res = Or(And(A, B), Xnor(B, C)); 1528 break; 1529 case 0xda: 1530 if (ABCIsConst) 1531 Res = Xor(A, And(Nand(A, B), C)); 1532 break; 1533 case 
  case 0xdb:
    if (ABCIsConst)
      Res = Or(Xnor(A, B), Xor(A, C));
    break;
  case 0xdc:
    if (ABCIsConst)
      Res = Or(B, Nor(C, Not(A)));
    break;
  case 0xdd:
    if (BCIsConst)
      Res = Or(B, Not(C));
    break;
  case 0xde:
    if (ABCIsConst)
      Res = Or(Xor(A, C), B);
    break;
  case 0xdf:
    if (ABCIsConst)
      Res = Or(Nand(A, C), B);
    break;
  case 0xe0:
    if (ABCIsConst)
      Res = And(A, Or(B, C));
    break;
  case 0xe1:
    if (ABCIsConst)
      Res = Xor(A, Nor(B, C));
    break;
  case 0xe2:
    if (ABCIsConst)
      Res = Xor(A, Nor(Xnor(A, C), B));
    break;
  case 0xe3:
    if (ABCIsConst)
      Res = Xor(A, Nor(And(A, C), B));
    break;
  case 0xe4:
    if (ABCIsConst)
      Res = Xor(A, Nor(Xnor(A, B), C));
    break;
  case 0xe5:
    if (ABCIsConst)
      Res = Xor(A, Nor(And(A, B), C));
    break;
  case 0xe6:
    if (ABCIsConst)
      Res = Or(And(A, B), Xor(B, C));
    break;
  case 0xe7:
    if (ABCIsConst)
      Res = Or(Xnor(A, B), Xnor(A, C));
    break;
  case 0xe8:
    if (ABCIsConst)
      Res = Xor(Or(A, B), Nor(Xnor(A, B), C));
    break;
  case 0xe9:
    if (ABCIsConst)
      Res = Xor(Xor(A, B), Nand(Nand(A, B), C));
    break;
  case 0xea:
    if (ABCIsConst)
      Res = Or(And(A, B), C);
    break;
  case 0xeb:
    if (ABCIsConst)
      Res = Or(Xnor(A, B), C);
    break;
  case 0xec:
    if (ABCIsConst)
      Res = Or(And(A, C), B);
    break;
  case 0xed:
    if (ABCIsConst)
      Res = Or(Xnor(A, C), B);
    break;
  case 0xee:
    Res = Or(B, C);
    break;
  case 0xef:
    if (ABCIsConst)
      Res = Nand(A, Nor(B, C));
    break;
  case 0xf0:
    Res = A;
    break;
  case 0xf1:
    if (ABCIsConst)
      Res = Or(A, Nor(B, C));
    break;
  case 0xf2:
    if (ABCIsConst)
      Res = Or(A, Nor(B, Not(C)));
    break;
  case 0xf3:
    if (ABIsConst)
      Res = Or(A, Not(B));
    break;
  case 0xf4:
    if (ABCIsConst)
      Res = Or(A, Nor(C, Not(B)));
    break;
  case 0xf5:
    if (ACIsConst)
      Res = Or(A, Not(C));
    break;
  case 0xf6:
    if (ABCIsConst)
      Res = Or(A, Xor(B, C));
    break;
  case 0xf7:
    if (ABCIsConst)
      Res = Or(A, Nand(B, C));
    break;
  case 0xf8:
    if (ABCIsConst)
      Res = Or(A, And(B, C));
    break;
  case 0xf9:
    if (ABCIsConst)
      Res = Or(A, Xnor(B, C));
    break;
  case 0xfa:
    Res = Or(A, C);
    break;
  case 0xfb:
    if (ABCIsConst)
      Res = Nand(Nor(A, C), B);
    break;
  case 0xfc:
    Res = Or(A, B);
    break;
  case 0xfd:
    if (ABCIsConst)
      Res = Nand(Nor(A, B), C);
    break;
  case 0xfe:
    if (ABCIsConst)
      Res = Or(Or(A, B), C);
    break;
  case 0xff:
    Res = {Constant::getAllOnesValue(Ty), 0xff};
    break;
  }

  assert((Res.first == nullptr || Res.second == Imm) &&
         "Simplification of ternary logic does not verify!");
  return Res.first;
}

static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //  [3:0] - zero mask for each 32-bit lane
  //  [5:4] - select one 32-bit destination lane
  //  [7:6] - select one 32-bit source lane
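  //
  // For example (illustrative immediate), Imm == 0x20 has an empty zero mask
  // and copies source lane 0 into destination lane 2, which becomes
  //   shufflevector(%op0, %op1, <4 x i32> <i32 0, i32 1, i32 4, i32 3>)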
  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  int ShuffleMask[4] = {0, 1, 2, 3};

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are extracting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
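    // For example (illustrative constants), Length == 16 and Index == 32
    // extract bytes 4 and 5 of the source, i.e. a v16i8 shuffle with mask
    // <4, 5, 18, 19, 20, 21, 22, 23, -1, ...> against a zero vector, leaving
    // the upper 64 bits undefined.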
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      auto *ShufTy = FixedVectorType::get(IntTy8, 16);

      SmallVector<int, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(i + Index);
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(i + 16);
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(-1);

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ShuffleMask);
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    auto *ShufTy = FixedVectorType::get(IntTy8, 16);

    SmallVector<int, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(i);
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(i + 16);
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(i);
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(-1);

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ShuffleMask);
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  auto *C0 = dyn_cast<Constant>(Op0);
  auto *C1 = dyn_cast<Constant>(Op1);
  auto *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  auto *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<FixedVectorType>(II.getType());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  int Indexes[64];

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = -1;
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.
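    // For example (illustrative bytes), a control byte of 0x83 has bit 7 set
    // and therefore selects the zero vector, while 0x05 selects byte 5 of the
    // same 128-bit lane as the destination byte.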
1958
1959 // The value of each index for the high 128-bit lane is the least
1960 // significant 4 bits of the respective shuffle control byte.
1961 Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
1962 Indexes[I] = Index;
1963 }
1964
1965 auto V1 = II.getArgOperand(0);
1966 auto V2 = Constant::getNullValue(VecTy);
1967 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts));
1968 }
1969
1970 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
1971 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
1972 InstCombiner::BuilderTy &Builder) {
1973 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
1974 if (!V)
1975 return nullptr;
1976
1977 auto *VecTy = cast<FixedVectorType>(II.getType());
1978 unsigned NumElts = VecTy->getNumElements();
1979 bool IsPD = VecTy->getScalarType()->isDoubleTy();
1980 unsigned NumLaneElts = IsPD ? 2 : 4;
1981 assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
1982
1983 // Construct a shuffle mask from constant integers or UNDEFs.
1984 int Indexes[16];
1985
1986 // The intrinsics only read one or two bits; clear the rest.
1987 for (unsigned I = 0; I < NumElts; ++I) {
1988 Constant *COp = V->getAggregateElement(I);
1989 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
1990 return nullptr;
1991
1992 if (isa<UndefValue>(COp)) {
1993 Indexes[I] = -1;
1994 continue;
1995 }
1996
1997 APInt Index = cast<ConstantInt>(COp)->getValue();
1998 Index = Index.zextOrTrunc(32).getLoBits(2);
1999
2000 // The PD variants use bit 1 to select the per-lane element index, so
2001 // shift down to convert to a generic shuffle mask index.
2002 if (IsPD)
2003 Index.lshrInPlace(1);
2004
2005 // The _256 variants are a bit trickier since the mask bits always index
2006 // into the corresponding 128-bit half. In order to convert to a generic
2007 // shuffle, we have to make that explicit.
2008 Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
2009
2010 Indexes[I] = Index.getZExtValue();
2011 }
2012
2013 auto V1 = II.getArgOperand(0);
2014 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, NumElts));
2015 }
2016
2017 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
2018 static Value *simplifyX86vpermv(const IntrinsicInst &II,
2019 InstCombiner::BuilderTy &Builder) {
2020 auto *V = dyn_cast<Constant>(II.getArgOperand(1));
2021 if (!V)
2022 return nullptr;
2023
2024 auto *VecTy = cast<FixedVectorType>(II.getType());
2025 unsigned Size = VecTy->getNumElements();
2026 assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
2027 "Unexpected shuffle mask size");
2028
2029 // Construct a shuffle mask from constant integers or UNDEFs.
2030 int Indexes[64];
2031
2032 for (unsigned I = 0; I < Size; ++I) {
2033 Constant *COp = V->getAggregateElement(I);
2034 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
2035 return nullptr;
2036
2037 if (isa<UndefValue>(COp)) {
2038 Indexes[I] = -1;
2039 continue;
2040 }
2041
2042 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
2043 Index &= Size - 1;
2044 Indexes[I] = Index;
2045 }
2046
2047 auto V1 = II.getArgOperand(0);
2048 return Builder.CreateShuffleVector(V1, ArrayRef(Indexes, Size));
2049 }
2050
2051 /// Attempt to convert vpermi2/vpermt2 to shufflevector if the mask is constant.
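/// As an illustrative example (values chosen here, not from the source): for a
/// <4 x i32> vpermi2var whose index operand is the constant <0, 5, 2, 7>, each
/// index is masked to (2 * Size) - 1, so the resulting shuffle takes elements
/// 0 and 2 from the first source and elements 1 and 3 from the second.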
2052 static Value *simplifyX86vpermv3(const IntrinsicInst &II, 2053 InstCombiner::BuilderTy &Builder) { 2054 auto *V = dyn_cast<Constant>(II.getArgOperand(1)); 2055 if (!V) 2056 return nullptr; 2057 2058 auto *VecTy = cast<FixedVectorType>(II.getType()); 2059 unsigned Size = VecTy->getNumElements(); 2060 assert((Size == 2 || Size == 4 || Size == 8 || Size == 16 || Size == 32 || 2061 Size == 64) && 2062 "Unexpected shuffle mask size"); 2063 2064 // Construct a shuffle mask from constant integers or UNDEFs. 2065 int Indexes[64]; 2066 2067 for (unsigned I = 0; I < Size; ++I) { 2068 Constant *COp = V->getAggregateElement(I); 2069 if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) 2070 return nullptr; 2071 2072 if (isa<UndefValue>(COp)) { 2073 Indexes[I] = -1; 2074 continue; 2075 } 2076 2077 uint32_t Index = cast<ConstantInt>(COp)->getZExtValue(); 2078 Index &= (2 * Size) - 1; 2079 Indexes[I] = Index; 2080 } 2081 2082 auto V1 = II.getArgOperand(0); 2083 auto V2 = II.getArgOperand(2); 2084 return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, Size)); 2085 } 2086 2087 std::optional<Instruction *> 2088 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { 2089 auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width, 2090 unsigned DemandedWidth) { 2091 APInt UndefElts(Width, 0); 2092 APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth); 2093 return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts); 2094 }; 2095 2096 Intrinsic::ID IID = II.getIntrinsicID(); 2097 switch (IID) { 2098 case Intrinsic::x86_bmi_bextr_32: 2099 case Intrinsic::x86_bmi_bextr_64: 2100 case Intrinsic::x86_tbm_bextri_u32: 2101 case Intrinsic::x86_tbm_bextri_u64: 2102 // If the RHS is a constant we can try some simplifications. 2103 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2104 uint64_t Shift = C->getZExtValue(); 2105 uint64_t Length = (Shift >> 8) & 0xff; 2106 Shift &= 0xff; 2107 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 2108 // If the length is 0 or the shift is out of range, replace with zero. 2109 if (Length == 0 || Shift >= BitWidth) { 2110 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2111 } 2112 // If the LHS is also a constant, we can completely constant fold this. 2113 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) { 2114 uint64_t Result = InC->getZExtValue() >> Shift; 2115 if (Length > BitWidth) 2116 Length = BitWidth; 2117 Result &= maskTrailingOnes<uint64_t>(Length); 2118 return IC.replaceInstUsesWith(II, 2119 ConstantInt::get(II.getType(), Result)); 2120 } 2121 // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we 2122 // are only masking bits that a shift already cleared? 2123 } 2124 break; 2125 2126 case Intrinsic::x86_bmi_bzhi_32: 2127 case Intrinsic::x86_bmi_bzhi_64: 2128 // If the RHS is a constant we can try some simplifications. 2129 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) { 2130 uint64_t Index = C->getZExtValue() & 0xff; 2131 unsigned BitWidth = II.getType()->getIntegerBitWidth(); 2132 if (Index >= BitWidth) { 2133 return IC.replaceInstUsesWith(II, II.getArgOperand(0)); 2134 } 2135 if (Index == 0) { 2136 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0)); 2137 } 2138 // If the LHS is also a constant, we can completely constant fold this. 
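        // For example (illustrative constants): bzhi.32(0x12345678, 8) keeps
        // only the low 8 bits of the source and folds to 0x00000078.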
2139 if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2140 uint64_t Result = InC->getZExtValue();
2141 Result &= maskTrailingOnes<uint64_t>(Index);
2142 return IC.replaceInstUsesWith(II,
2143 ConstantInt::get(II.getType(), Result));
2144 }
2145 // TODO should we convert this to an AND if the RHS is constant?
2146 }
2147 break;
2148 case Intrinsic::x86_bmi_pext_32:
2149 case Intrinsic::x86_bmi_pext_64:
2150 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2151 if (MaskC->isNullValue()) {
2152 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2153 }
2154 if (MaskC->isAllOnesValue()) {
2155 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2156 }
2157
2158 unsigned MaskIdx, MaskLen;
2159 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2160 // Any single contiguous sequence of 1s anywhere in the mask simply
2161 // describes a subset of the input bits shifted to the appropriate
2162 // position. Replace with the straightforward IR.
2163 Value *Input = II.getArgOperand(0);
2164 Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
2165 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2166 Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
2167 return IC.replaceInstUsesWith(II, Shifted);
2168 }
2169
2170 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2171 uint64_t Src = SrcC->getZExtValue();
2172 uint64_t Mask = MaskC->getZExtValue();
2173 uint64_t Result = 0;
2174 uint64_t BitToSet = 1;
2175
2176 while (Mask) {
2177 // Isolate lowest set bit.
2178 uint64_t BitToTest = Mask & -Mask;
2179 if (BitToTest & Src)
2180 Result |= BitToSet;
2181
2182 BitToSet <<= 1;
2183 // Clear lowest set bit.
2184 Mask &= Mask - 1;
2185 }
2186
2187 return IC.replaceInstUsesWith(II,
2188 ConstantInt::get(II.getType(), Result));
2189 }
2190 }
2191 break;
2192 case Intrinsic::x86_bmi_pdep_32:
2193 case Intrinsic::x86_bmi_pdep_64:
2194 if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
2195 if (MaskC->isNullValue()) {
2196 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
2197 }
2198 if (MaskC->isAllOnesValue()) {
2199 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
2200 }
2201
2202 unsigned MaskIdx, MaskLen;
2203 if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
2204 // Any single contiguous sequence of 1s anywhere in the mask simply
2205 // describes a subset of the input bits shifted to the appropriate
2206 // position. Replace with the straightforward IR.
2207 Value *Input = II.getArgOperand(0);
2208 Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
2209 Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
2210 Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
2211 return IC.replaceInstUsesWith(II, Masked);
2212 }
2213
2214 if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
2215 uint64_t Src = SrcC->getZExtValue();
2216 uint64_t Mask = MaskC->getZExtValue();
2217 uint64_t Result = 0;
2218 uint64_t BitToTest = 1;
2219
2220 while (Mask) {
2221 // Isolate lowest set bit.
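          // (Two's-complement identity, shown on illustrative bits: for
          // Mask = 0b01101000, Mask & -Mask yields 0b00001000, the lowest set
          // bit; the Mask &= Mask - 1 below then clears that bit.)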
2222 uint64_t BitToSet = Mask & -Mask; 2223 if (BitToTest & Src) 2224 Result |= BitToSet; 2225 2226 BitToTest <<= 1; 2227 // Clear lowest set bit; 2228 Mask &= Mask - 1; 2229 } 2230 2231 return IC.replaceInstUsesWith(II, 2232 ConstantInt::get(II.getType(), Result)); 2233 } 2234 } 2235 break; 2236 2237 case Intrinsic::x86_sse_cvtss2si: 2238 case Intrinsic::x86_sse_cvtss2si64: 2239 case Intrinsic::x86_sse_cvttss2si: 2240 case Intrinsic::x86_sse_cvttss2si64: 2241 case Intrinsic::x86_sse2_cvtsd2si: 2242 case Intrinsic::x86_sse2_cvtsd2si64: 2243 case Intrinsic::x86_sse2_cvttsd2si: 2244 case Intrinsic::x86_sse2_cvttsd2si64: 2245 case Intrinsic::x86_avx512_vcvtss2si32: 2246 case Intrinsic::x86_avx512_vcvtss2si64: 2247 case Intrinsic::x86_avx512_vcvtss2usi32: 2248 case Intrinsic::x86_avx512_vcvtss2usi64: 2249 case Intrinsic::x86_avx512_vcvtsd2si32: 2250 case Intrinsic::x86_avx512_vcvtsd2si64: 2251 case Intrinsic::x86_avx512_vcvtsd2usi32: 2252 case Intrinsic::x86_avx512_vcvtsd2usi64: 2253 case Intrinsic::x86_avx512_cvttss2si: 2254 case Intrinsic::x86_avx512_cvttss2si64: 2255 case Intrinsic::x86_avx512_cvttss2usi: 2256 case Intrinsic::x86_avx512_cvttss2usi64: 2257 case Intrinsic::x86_avx512_cvttsd2si: 2258 case Intrinsic::x86_avx512_cvttsd2si64: 2259 case Intrinsic::x86_avx512_cvttsd2usi: 2260 case Intrinsic::x86_avx512_cvttsd2usi64: { 2261 // These intrinsics only demand the 0th element of their input vectors. If 2262 // we can simplify the input based on that, do so now. 2263 Value *Arg = II.getArgOperand(0); 2264 unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements(); 2265 if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) { 2266 return IC.replaceOperand(II, 0, V); 2267 } 2268 break; 2269 } 2270 2271 case Intrinsic::x86_mmx_pmovmskb: 2272 case Intrinsic::x86_sse_movmsk_ps: 2273 case Intrinsic::x86_sse2_movmsk_pd: 2274 case Intrinsic::x86_sse2_pmovmskb_128: 2275 case Intrinsic::x86_avx_movmsk_pd_256: 2276 case Intrinsic::x86_avx_movmsk_ps_256: 2277 case Intrinsic::x86_avx2_pmovmskb: 2278 if (Value *V = simplifyX86movmsk(II, IC.Builder)) { 2279 return IC.replaceInstUsesWith(II, V); 2280 } 2281 break; 2282 2283 case Intrinsic::x86_sse_comieq_ss: 2284 case Intrinsic::x86_sse_comige_ss: 2285 case Intrinsic::x86_sse_comigt_ss: 2286 case Intrinsic::x86_sse_comile_ss: 2287 case Intrinsic::x86_sse_comilt_ss: 2288 case Intrinsic::x86_sse_comineq_ss: 2289 case Intrinsic::x86_sse_ucomieq_ss: 2290 case Intrinsic::x86_sse_ucomige_ss: 2291 case Intrinsic::x86_sse_ucomigt_ss: 2292 case Intrinsic::x86_sse_ucomile_ss: 2293 case Intrinsic::x86_sse_ucomilt_ss: 2294 case Intrinsic::x86_sse_ucomineq_ss: 2295 case Intrinsic::x86_sse2_comieq_sd: 2296 case Intrinsic::x86_sse2_comige_sd: 2297 case Intrinsic::x86_sse2_comigt_sd: 2298 case Intrinsic::x86_sse2_comile_sd: 2299 case Intrinsic::x86_sse2_comilt_sd: 2300 case Intrinsic::x86_sse2_comineq_sd: 2301 case Intrinsic::x86_sse2_ucomieq_sd: 2302 case Intrinsic::x86_sse2_ucomige_sd: 2303 case Intrinsic::x86_sse2_ucomigt_sd: 2304 case Intrinsic::x86_sse2_ucomile_sd: 2305 case Intrinsic::x86_sse2_ucomilt_sd: 2306 case Intrinsic::x86_sse2_ucomineq_sd: 2307 case Intrinsic::x86_avx512_vcomi_ss: 2308 case Intrinsic::x86_avx512_vcomi_sd: 2309 case Intrinsic::x86_avx512_mask_cmp_ss: 2310 case Intrinsic::x86_avx512_mask_cmp_sd: { 2311 // These intrinsics only demand the 0th element of their input vectors. If 2312 // we can simplify the input based on that, do so now. 
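    // For example (illustrative IR): if Arg0 is
    //   %v = insertelement <4 x float> %x, float %f, i32 2
    // the insert into lane 2 is irrelevant to the comparison of lane 0, and
    // the demanded-elements call below can shrink the operand back to %x.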
2313 bool MadeChange = false; 2314 Value *Arg0 = II.getArgOperand(0); 2315 Value *Arg1 = II.getArgOperand(1); 2316 unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2317 if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) { 2318 IC.replaceOperand(II, 0, V); 2319 MadeChange = true; 2320 } 2321 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) { 2322 IC.replaceOperand(II, 1, V); 2323 MadeChange = true; 2324 } 2325 if (MadeChange) { 2326 return &II; 2327 } 2328 break; 2329 } 2330 2331 case Intrinsic::x86_avx512_add_ps_512: 2332 case Intrinsic::x86_avx512_div_ps_512: 2333 case Intrinsic::x86_avx512_mul_ps_512: 2334 case Intrinsic::x86_avx512_sub_ps_512: 2335 case Intrinsic::x86_avx512_add_pd_512: 2336 case Intrinsic::x86_avx512_div_pd_512: 2337 case Intrinsic::x86_avx512_mul_pd_512: 2338 case Intrinsic::x86_avx512_sub_pd_512: 2339 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2340 // IR operations. 2341 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2342 if (R->getValue() == 4) { 2343 Value *Arg0 = II.getArgOperand(0); 2344 Value *Arg1 = II.getArgOperand(1); 2345 2346 Value *V; 2347 switch (IID) { 2348 default: 2349 llvm_unreachable("Case stmts out of sync!"); 2350 case Intrinsic::x86_avx512_add_ps_512: 2351 case Intrinsic::x86_avx512_add_pd_512: 2352 V = IC.Builder.CreateFAdd(Arg0, Arg1); 2353 break; 2354 case Intrinsic::x86_avx512_sub_ps_512: 2355 case Intrinsic::x86_avx512_sub_pd_512: 2356 V = IC.Builder.CreateFSub(Arg0, Arg1); 2357 break; 2358 case Intrinsic::x86_avx512_mul_ps_512: 2359 case Intrinsic::x86_avx512_mul_pd_512: 2360 V = IC.Builder.CreateFMul(Arg0, Arg1); 2361 break; 2362 case Intrinsic::x86_avx512_div_ps_512: 2363 case Intrinsic::x86_avx512_div_pd_512: 2364 V = IC.Builder.CreateFDiv(Arg0, Arg1); 2365 break; 2366 } 2367 2368 return IC.replaceInstUsesWith(II, V); 2369 } 2370 } 2371 break; 2372 2373 case Intrinsic::x86_avx512_mask_add_ss_round: 2374 case Intrinsic::x86_avx512_mask_div_ss_round: 2375 case Intrinsic::x86_avx512_mask_mul_ss_round: 2376 case Intrinsic::x86_avx512_mask_sub_ss_round: 2377 case Intrinsic::x86_avx512_mask_add_sd_round: 2378 case Intrinsic::x86_avx512_mask_div_sd_round: 2379 case Intrinsic::x86_avx512_mask_mul_sd_round: 2380 case Intrinsic::x86_avx512_mask_sub_sd_round: 2381 // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular 2382 // IR operations. 2383 if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) { 2384 if (R->getValue() == 4) { 2385 // Extract the element as scalars. 
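        // Roughly, the rewrite below turns e.g. (illustrative)
        //   mask_add_ss_round(a, b, passthru, k, 4)
        // into: extract lane 0 of a and b, fadd them, select against lane 0
        // of passthru on bit 0 of k (unless that bit is known set), and
        // insert the result back into lane 0 of a.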
2386 Value *Arg0 = II.getArgOperand(0); 2387 Value *Arg1 = II.getArgOperand(1); 2388 Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0); 2389 Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0); 2390 2391 Value *V; 2392 switch (IID) { 2393 default: 2394 llvm_unreachable("Case stmts out of sync!"); 2395 case Intrinsic::x86_avx512_mask_add_ss_round: 2396 case Intrinsic::x86_avx512_mask_add_sd_round: 2397 V = IC.Builder.CreateFAdd(LHS, RHS); 2398 break; 2399 case Intrinsic::x86_avx512_mask_sub_ss_round: 2400 case Intrinsic::x86_avx512_mask_sub_sd_round: 2401 V = IC.Builder.CreateFSub(LHS, RHS); 2402 break; 2403 case Intrinsic::x86_avx512_mask_mul_ss_round: 2404 case Intrinsic::x86_avx512_mask_mul_sd_round: 2405 V = IC.Builder.CreateFMul(LHS, RHS); 2406 break; 2407 case Intrinsic::x86_avx512_mask_div_ss_round: 2408 case Intrinsic::x86_avx512_mask_div_sd_round: 2409 V = IC.Builder.CreateFDiv(LHS, RHS); 2410 break; 2411 } 2412 2413 // Handle the masking aspect of the intrinsic. 2414 Value *Mask = II.getArgOperand(3); 2415 auto *C = dyn_cast<ConstantInt>(Mask); 2416 // We don't need a select if we know the mask bit is a 1. 2417 if (!C || !C->getValue()[0]) { 2418 // Cast the mask to an i1 vector and then extract the lowest element. 2419 auto *MaskTy = FixedVectorType::get( 2420 IC.Builder.getInt1Ty(), 2421 cast<IntegerType>(Mask->getType())->getBitWidth()); 2422 Mask = IC.Builder.CreateBitCast(Mask, MaskTy); 2423 Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0); 2424 // Extract the lowest element from the passthru operand. 2425 Value *Passthru = 2426 IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0); 2427 V = IC.Builder.CreateSelect(Mask, V, Passthru); 2428 } 2429 2430 // Insert the result back into the original argument 0. 2431 V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0); 2432 2433 return IC.replaceInstUsesWith(II, V); 2434 } 2435 } 2436 break; 2437 2438 // Constant fold ashr( <A x Bi>, Ci ). 2439 // Constant fold lshr( <A x Bi>, Ci ). 2440 // Constant fold shl( <A x Bi>, Ci ). 
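  // For example (illustrative): psrai.w(<8 x i16> %x, 3) can become
  // ashr <8 x i16> %x, <splat of i16 3>, letting generic folds take over.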
2441 case Intrinsic::x86_sse2_psrai_d: 2442 case Intrinsic::x86_sse2_psrai_w: 2443 case Intrinsic::x86_avx2_psrai_d: 2444 case Intrinsic::x86_avx2_psrai_w: 2445 case Intrinsic::x86_avx512_psrai_q_128: 2446 case Intrinsic::x86_avx512_psrai_q_256: 2447 case Intrinsic::x86_avx512_psrai_d_512: 2448 case Intrinsic::x86_avx512_psrai_q_512: 2449 case Intrinsic::x86_avx512_psrai_w_512: 2450 case Intrinsic::x86_sse2_psrli_d: 2451 case Intrinsic::x86_sse2_psrli_q: 2452 case Intrinsic::x86_sse2_psrli_w: 2453 case Intrinsic::x86_avx2_psrli_d: 2454 case Intrinsic::x86_avx2_psrli_q: 2455 case Intrinsic::x86_avx2_psrli_w: 2456 case Intrinsic::x86_avx512_psrli_d_512: 2457 case Intrinsic::x86_avx512_psrli_q_512: 2458 case Intrinsic::x86_avx512_psrli_w_512: 2459 case Intrinsic::x86_sse2_pslli_d: 2460 case Intrinsic::x86_sse2_pslli_q: 2461 case Intrinsic::x86_sse2_pslli_w: 2462 case Intrinsic::x86_avx2_pslli_d: 2463 case Intrinsic::x86_avx2_pslli_q: 2464 case Intrinsic::x86_avx2_pslli_w: 2465 case Intrinsic::x86_avx512_pslli_d_512: 2466 case Intrinsic::x86_avx512_pslli_q_512: 2467 case Intrinsic::x86_avx512_pslli_w_512: 2468 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2469 return IC.replaceInstUsesWith(II, V); 2470 } 2471 break; 2472 2473 case Intrinsic::x86_sse2_psra_d: 2474 case Intrinsic::x86_sse2_psra_w: 2475 case Intrinsic::x86_avx2_psra_d: 2476 case Intrinsic::x86_avx2_psra_w: 2477 case Intrinsic::x86_avx512_psra_q_128: 2478 case Intrinsic::x86_avx512_psra_q_256: 2479 case Intrinsic::x86_avx512_psra_d_512: 2480 case Intrinsic::x86_avx512_psra_q_512: 2481 case Intrinsic::x86_avx512_psra_w_512: 2482 case Intrinsic::x86_sse2_psrl_d: 2483 case Intrinsic::x86_sse2_psrl_q: 2484 case Intrinsic::x86_sse2_psrl_w: 2485 case Intrinsic::x86_avx2_psrl_d: 2486 case Intrinsic::x86_avx2_psrl_q: 2487 case Intrinsic::x86_avx2_psrl_w: 2488 case Intrinsic::x86_avx512_psrl_d_512: 2489 case Intrinsic::x86_avx512_psrl_q_512: 2490 case Intrinsic::x86_avx512_psrl_w_512: 2491 case Intrinsic::x86_sse2_psll_d: 2492 case Intrinsic::x86_sse2_psll_q: 2493 case Intrinsic::x86_sse2_psll_w: 2494 case Intrinsic::x86_avx2_psll_d: 2495 case Intrinsic::x86_avx2_psll_q: 2496 case Intrinsic::x86_avx2_psll_w: 2497 case Intrinsic::x86_avx512_psll_d_512: 2498 case Intrinsic::x86_avx512_psll_q_512: 2499 case Intrinsic::x86_avx512_psll_w_512: { 2500 if (Value *V = simplifyX86immShift(II, IC.Builder)) { 2501 return IC.replaceInstUsesWith(II, V); 2502 } 2503 2504 // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector 2505 // operand to compute the shift amount. 
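    // Illustratively, for psrl.d the count is read from the low 64 bits of
    // the <4 x i32> second operand, so only its first two elements are
    // demanded by the call below (VWidth / 2).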
2506 Value *Arg1 = II.getArgOperand(1); 2507 assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 && 2508 "Unexpected packed shift size"); 2509 unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements(); 2510 2511 if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) { 2512 return IC.replaceOperand(II, 1, V); 2513 } 2514 break; 2515 } 2516 2517 case Intrinsic::x86_avx2_psllv_d: 2518 case Intrinsic::x86_avx2_psllv_d_256: 2519 case Intrinsic::x86_avx2_psllv_q: 2520 case Intrinsic::x86_avx2_psllv_q_256: 2521 case Intrinsic::x86_avx512_psllv_d_512: 2522 case Intrinsic::x86_avx512_psllv_q_512: 2523 case Intrinsic::x86_avx512_psllv_w_128: 2524 case Intrinsic::x86_avx512_psllv_w_256: 2525 case Intrinsic::x86_avx512_psllv_w_512: 2526 case Intrinsic::x86_avx2_psrav_d: 2527 case Intrinsic::x86_avx2_psrav_d_256: 2528 case Intrinsic::x86_avx512_psrav_q_128: 2529 case Intrinsic::x86_avx512_psrav_q_256: 2530 case Intrinsic::x86_avx512_psrav_d_512: 2531 case Intrinsic::x86_avx512_psrav_q_512: 2532 case Intrinsic::x86_avx512_psrav_w_128: 2533 case Intrinsic::x86_avx512_psrav_w_256: 2534 case Intrinsic::x86_avx512_psrav_w_512: 2535 case Intrinsic::x86_avx2_psrlv_d: 2536 case Intrinsic::x86_avx2_psrlv_d_256: 2537 case Intrinsic::x86_avx2_psrlv_q: 2538 case Intrinsic::x86_avx2_psrlv_q_256: 2539 case Intrinsic::x86_avx512_psrlv_d_512: 2540 case Intrinsic::x86_avx512_psrlv_q_512: 2541 case Intrinsic::x86_avx512_psrlv_w_128: 2542 case Intrinsic::x86_avx512_psrlv_w_256: 2543 case Intrinsic::x86_avx512_psrlv_w_512: 2544 if (Value *V = simplifyX86varShift(II, IC.Builder)) { 2545 return IC.replaceInstUsesWith(II, V); 2546 } 2547 break; 2548 2549 case Intrinsic::x86_sse2_packssdw_128: 2550 case Intrinsic::x86_sse2_packsswb_128: 2551 case Intrinsic::x86_avx2_packssdw: 2552 case Intrinsic::x86_avx2_packsswb: 2553 case Intrinsic::x86_avx512_packssdw_512: 2554 case Intrinsic::x86_avx512_packsswb_512: 2555 if (Value *V = simplifyX86pack(II, IC.Builder, true)) { 2556 return IC.replaceInstUsesWith(II, V); 2557 } 2558 break; 2559 2560 case Intrinsic::x86_sse2_packuswb_128: 2561 case Intrinsic::x86_sse41_packusdw: 2562 case Intrinsic::x86_avx2_packusdw: 2563 case Intrinsic::x86_avx2_packuswb: 2564 case Intrinsic::x86_avx512_packusdw_512: 2565 case Intrinsic::x86_avx512_packuswb_512: 2566 if (Value *V = simplifyX86pack(II, IC.Builder, false)) { 2567 return IC.replaceInstUsesWith(II, V); 2568 } 2569 break; 2570 2571 case Intrinsic::x86_sse2_pmadd_wd: 2572 case Intrinsic::x86_avx2_pmadd_wd: 2573 case Intrinsic::x86_avx512_pmaddw_d_512: 2574 if (Value *V = simplifyX86pmadd(II, IC.Builder, true)) { 2575 return IC.replaceInstUsesWith(II, V); 2576 } 2577 break; 2578 2579 case Intrinsic::x86_ssse3_pmadd_ub_sw_128: 2580 case Intrinsic::x86_avx2_pmadd_ub_sw: 2581 case Intrinsic::x86_avx512_pmaddubs_w_512: 2582 if (Value *V = simplifyX86pmadd(II, IC.Builder, false)) { 2583 return IC.replaceInstUsesWith(II, V); 2584 } 2585 break; 2586 2587 case Intrinsic::x86_pclmulqdq: 2588 case Intrinsic::x86_pclmulqdq_256: 2589 case Intrinsic::x86_pclmulqdq_512: { 2590 if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) { 2591 unsigned Imm = C->getZExtValue(); 2592 2593 bool MadeChange = false; 2594 Value *Arg0 = II.getArgOperand(0); 2595 Value *Arg1 = II.getArgOperand(1); 2596 unsigned VWidth = 2597 cast<FixedVectorType>(Arg0->getType())->getNumElements(); 2598 2599 APInt UndefElts1(VWidth, 0); 2600 APInt DemandedElts1 = 2601 APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 
2 : 1)); 2602 if (Value *V = 2603 IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) { 2604 IC.replaceOperand(II, 0, V); 2605 MadeChange = true; 2606 } 2607 2608 APInt UndefElts2(VWidth, 0); 2609 APInt DemandedElts2 = 2610 APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1)); 2611 if (Value *V = 2612 IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) { 2613 IC.replaceOperand(II, 1, V); 2614 MadeChange = true; 2615 } 2616 2617 // If either input elements are undef, the result is zero. 2618 if (DemandedElts1.isSubsetOf(UndefElts1) || 2619 DemandedElts2.isSubsetOf(UndefElts2)) { 2620 return IC.replaceInstUsesWith(II, 2621 ConstantAggregateZero::get(II.getType())); 2622 } 2623 2624 if (MadeChange) { 2625 return &II; 2626 } 2627 } 2628 break; 2629 } 2630 2631 case Intrinsic::x86_sse41_insertps: 2632 if (Value *V = simplifyX86insertps(II, IC.Builder)) { 2633 return IC.replaceInstUsesWith(II, V); 2634 } 2635 break; 2636 2637 case Intrinsic::x86_sse4a_extrq: { 2638 Value *Op0 = II.getArgOperand(0); 2639 Value *Op1 = II.getArgOperand(1); 2640 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2641 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2642 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2643 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2644 VWidth1 == 16 && "Unexpected operand sizes"); 2645 2646 // See if we're dealing with constant values. 2647 auto *C1 = dyn_cast<Constant>(Op1); 2648 auto *CILength = 2649 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0)) 2650 : nullptr; 2651 auto *CIIndex = 2652 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2653 : nullptr; 2654 2655 // Attempt to simplify to a constant, shuffle vector or EXTRQI call. 2656 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2657 return IC.replaceInstUsesWith(II, V); 2658 } 2659 2660 // EXTRQ only uses the lowest 64-bits of the first 128-bit vector 2661 // operands and the lowest 16-bits of the second. 2662 bool MadeChange = false; 2663 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2664 IC.replaceOperand(II, 0, V); 2665 MadeChange = true; 2666 } 2667 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) { 2668 IC.replaceOperand(II, 1, V); 2669 MadeChange = true; 2670 } 2671 if (MadeChange) { 2672 return &II; 2673 } 2674 break; 2675 } 2676 2677 case Intrinsic::x86_sse4a_extrqi: { 2678 // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining 2679 // bits of the lower 64-bits. The upper 64-bits are undefined. 2680 Value *Op0 = II.getArgOperand(0); 2681 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2682 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2683 "Unexpected operand size"); 2684 2685 // See if we're dealing with constant values. 2686 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1)); 2687 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2688 2689 // Attempt to simplify to a constant or shuffle vector. 2690 if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) { 2691 return IC.replaceInstUsesWith(II, V); 2692 } 2693 2694 // EXTRQI only uses the lowest 64-bits of the first 128-bit vector 2695 // operand. 
2696 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2697 return IC.replaceOperand(II, 0, V); 2698 } 2699 break; 2700 } 2701 2702 case Intrinsic::x86_sse4a_insertq: { 2703 Value *Op0 = II.getArgOperand(0); 2704 Value *Op1 = II.getArgOperand(1); 2705 unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2706 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2707 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 && 2708 cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 && 2709 "Unexpected operand size"); 2710 2711 // See if we're dealing with constant values. 2712 auto *C1 = dyn_cast<Constant>(Op1); 2713 auto *CI11 = 2714 C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1)) 2715 : nullptr; 2716 2717 // Attempt to simplify to a constant, shuffle vector or INSERTQI call. 2718 if (CI11) { 2719 const APInt &V11 = CI11->getValue(); 2720 APInt Len = V11.zextOrTrunc(6); 2721 APInt Idx = V11.lshr(8).zextOrTrunc(6); 2722 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2723 return IC.replaceInstUsesWith(II, V); 2724 } 2725 } 2726 2727 // INSERTQ only uses the lowest 64-bits of the first 128-bit vector 2728 // operand. 2729 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) { 2730 return IC.replaceOperand(II, 0, V); 2731 } 2732 break; 2733 } 2734 2735 case Intrinsic::x86_sse4a_insertqi: { 2736 // INSERTQI: Extract lowest Length bits from lower half of second source and 2737 // insert over first source starting at Index bit. The upper 64-bits are 2738 // undefined. 2739 Value *Op0 = II.getArgOperand(0); 2740 Value *Op1 = II.getArgOperand(1); 2741 unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements(); 2742 unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements(); 2743 assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && 2744 Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 && 2745 VWidth1 == 2 && "Unexpected operand sizes"); 2746 2747 // See if we're dealing with constant values. 2748 auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2)); 2749 auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3)); 2750 2751 // Attempt to simplify to a constant or shuffle vector. 2752 if (CILength && CIIndex) { 2753 APInt Len = CILength->getValue().zextOrTrunc(6); 2754 APInt Idx = CIIndex->getValue().zextOrTrunc(6); 2755 if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) { 2756 return IC.replaceInstUsesWith(II, V); 2757 } 2758 } 2759 2760 // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector 2761 // operands. 2762 bool MadeChange = false; 2763 if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) { 2764 IC.replaceOperand(II, 0, V); 2765 MadeChange = true; 2766 } 2767 if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) { 2768 IC.replaceOperand(II, 1, V); 2769 MadeChange = true; 2770 } 2771 if (MadeChange) { 2772 return &II; 2773 } 2774 break; 2775 } 2776 2777 case Intrinsic::x86_sse41_pblendvb: 2778 case Intrinsic::x86_sse41_blendvps: 2779 case Intrinsic::x86_sse41_blendvpd: 2780 case Intrinsic::x86_avx_blendv_ps_256: 2781 case Intrinsic::x86_avx_blendv_pd_256: 2782 case Intrinsic::x86_avx2_pblendvb: { 2783 // fold (blend A, A, Mask) -> A 2784 Value *Op0 = II.getArgOperand(0); 2785 Value *Op1 = II.getArgOperand(1); 2786 Value *Mask = II.getArgOperand(2); 2787 if (Op0 == Op1) { 2788 return IC.replaceInstUsesWith(II, Op0); 2789 } 2790 2791 // Zero Mask - select 1st argument. 
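    // (blendv semantics: each result lane takes Op1 when the corresponding
    // mask element's sign bit is set and Op0 otherwise, so an all-zero mask
    // always selects Op0.)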
2792 if (isa<ConstantAggregateZero>(Mask)) {
2793 return IC.replaceInstUsesWith(II, Op0);
2794 }
2795
2796 // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
2797 if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
2798 Constant *NewSelector =
2799 getNegativeIsTrueBoolVec(ConstantMask, IC.getDataLayout());
2800 return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
2801 }
2802
2803 // Peek through a one-use shuffle - VectorCombine should have simplified
2804 // this for cases where we're splitting wider vectors to use blendv
2805 // intrinsics.
2806 Value *MaskSrc = nullptr;
2807 ArrayRef<int> ShuffleMask;
2808 if (match(Mask, PatternMatch::m_OneUse(PatternMatch::m_Shuffle(
2809 PatternMatch::m_Value(MaskSrc), PatternMatch::m_Undef(),
2810 PatternMatch::m_Mask(ShuffleMask))))) {
2811 // Bail if the shuffle was irregular or contains undefs.
2812 int NumElts = cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2813 if (NumElts < (int)ShuffleMask.size() || !isPowerOf2_32(NumElts) ||
2814 any_of(ShuffleMask,
2815 [NumElts](int M) { return M < 0 || M >= NumElts; }))
2816 break;
2817 Mask = MaskSrc;
2818 }
2819
2820 // Convert to a vector select if we can bypass casts and find a boolean
2821 // vector condition value.
2822 Value *BoolVec;
2823 Mask = InstCombiner::peekThroughBitcast(Mask);
2824 if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
2825 BoolVec->getType()->isVectorTy() &&
2826 BoolVec->getType()->getScalarSizeInBits() == 1) {
2827 auto *MaskTy = cast<FixedVectorType>(Mask->getType());
2828 auto *OpTy = cast<FixedVectorType>(II.getType());
2829 unsigned NumMaskElts = MaskTy->getNumElements();
2830 unsigned NumOperandElts = OpTy->getNumElements();
2831
2832 // If we peeked through a shuffle, reapply the shuffle to the bool vector.
2833 if (MaskSrc) {
2834 unsigned NumMaskSrcElts =
2835 cast<FixedVectorType>(MaskSrc->getType())->getNumElements();
2836 NumMaskElts = (ShuffleMask.size() * NumMaskElts) / NumMaskSrcElts;
2837 // Multiple mask bits map to the same operand element - bail out.
2838 if (NumMaskElts > NumOperandElts)
2839 break;
2840 SmallVector<int> ScaledMask;
2841 if (!llvm::scaleShuffleMaskElts(NumMaskElts, ShuffleMask, ScaledMask))
2842 break;
2843 BoolVec = IC.Builder.CreateShuffleVector(BoolVec, ScaledMask);
2844 MaskTy = FixedVectorType::get(MaskTy->getElementType(), NumMaskElts);
2845 }
2846 assert(MaskTy->getPrimitiveSizeInBits() ==
2847 OpTy->getPrimitiveSizeInBits() &&
2848 "Not expecting mask and operands with different sizes");
2849
2850 if (NumMaskElts == NumOperandElts) {
2851 return SelectInst::Create(BoolVec, Op1, Op0);
2852 }
2853
2854 // If the mask has fewer elements than the operands, each mask bit maps to
2855 // multiple elements of the operands. Bitcast back and forth.
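      // Illustrative example (types chosen here): a <32 x i8> pblendvb whose
      // mask is a sext of <4 x i1> (seen through a bitcast) selects whole
      // 64-bit chunks, so the operands are bitcast to <4 x i64>, selected on
      // the <4 x i1> condition, and the result is bitcast back to <32 x i8>.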
2856 if (NumMaskElts < NumOperandElts) { 2857 Value *CastOp0 = IC.Builder.CreateBitCast(Op0, MaskTy); 2858 Value *CastOp1 = IC.Builder.CreateBitCast(Op1, MaskTy); 2859 Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0); 2860 return new BitCastInst(Sel, II.getType()); 2861 } 2862 } 2863 2864 break; 2865 } 2866 2867 case Intrinsic::x86_ssse3_pshuf_b_128: 2868 case Intrinsic::x86_avx2_pshuf_b: 2869 case Intrinsic::x86_avx512_pshuf_b_512: 2870 if (Value *V = simplifyX86pshufb(II, IC.Builder)) { 2871 return IC.replaceInstUsesWith(II, V); 2872 } 2873 break; 2874 2875 case Intrinsic::x86_avx_vpermilvar_ps: 2876 case Intrinsic::x86_avx_vpermilvar_ps_256: 2877 case Intrinsic::x86_avx512_vpermilvar_ps_512: 2878 case Intrinsic::x86_avx_vpermilvar_pd: 2879 case Intrinsic::x86_avx_vpermilvar_pd_256: 2880 case Intrinsic::x86_avx512_vpermilvar_pd_512: 2881 if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) { 2882 return IC.replaceInstUsesWith(II, V); 2883 } 2884 break; 2885 2886 case Intrinsic::x86_avx2_permd: 2887 case Intrinsic::x86_avx2_permps: 2888 case Intrinsic::x86_avx512_permvar_df_256: 2889 case Intrinsic::x86_avx512_permvar_df_512: 2890 case Intrinsic::x86_avx512_permvar_di_256: 2891 case Intrinsic::x86_avx512_permvar_di_512: 2892 case Intrinsic::x86_avx512_permvar_hi_128: 2893 case Intrinsic::x86_avx512_permvar_hi_256: 2894 case Intrinsic::x86_avx512_permvar_hi_512: 2895 case Intrinsic::x86_avx512_permvar_qi_128: 2896 case Intrinsic::x86_avx512_permvar_qi_256: 2897 case Intrinsic::x86_avx512_permvar_qi_512: 2898 case Intrinsic::x86_avx512_permvar_sf_512: 2899 case Intrinsic::x86_avx512_permvar_si_512: 2900 if (Value *V = simplifyX86vpermv(II, IC.Builder)) { 2901 return IC.replaceInstUsesWith(II, V); 2902 } 2903 break; 2904 2905 case Intrinsic::x86_avx512_vpermi2var_d_128: 2906 case Intrinsic::x86_avx512_vpermi2var_d_256: 2907 case Intrinsic::x86_avx512_vpermi2var_d_512: 2908 case Intrinsic::x86_avx512_vpermi2var_hi_128: 2909 case Intrinsic::x86_avx512_vpermi2var_hi_256: 2910 case Intrinsic::x86_avx512_vpermi2var_hi_512: 2911 case Intrinsic::x86_avx512_vpermi2var_pd_128: 2912 case Intrinsic::x86_avx512_vpermi2var_pd_256: 2913 case Intrinsic::x86_avx512_vpermi2var_pd_512: 2914 case Intrinsic::x86_avx512_vpermi2var_ps_128: 2915 case Intrinsic::x86_avx512_vpermi2var_ps_256: 2916 case Intrinsic::x86_avx512_vpermi2var_ps_512: 2917 case Intrinsic::x86_avx512_vpermi2var_q_128: 2918 case Intrinsic::x86_avx512_vpermi2var_q_256: 2919 case Intrinsic::x86_avx512_vpermi2var_q_512: 2920 case Intrinsic::x86_avx512_vpermi2var_qi_128: 2921 case Intrinsic::x86_avx512_vpermi2var_qi_256: 2922 case Intrinsic::x86_avx512_vpermi2var_qi_512: 2923 if (Value *V = simplifyX86vpermv3(II, IC.Builder)) { 2924 return IC.replaceInstUsesWith(II, V); 2925 } 2926 break; 2927 2928 case Intrinsic::x86_avx_maskload_ps: 2929 case Intrinsic::x86_avx_maskload_pd: 2930 case Intrinsic::x86_avx_maskload_ps_256: 2931 case Intrinsic::x86_avx_maskload_pd_256: 2932 case Intrinsic::x86_avx2_maskload_d: 2933 case Intrinsic::x86_avx2_maskload_q: 2934 case Intrinsic::x86_avx2_maskload_d_256: 2935 case Intrinsic::x86_avx2_maskload_q_256: 2936 if (Instruction *I = simplifyX86MaskedLoad(II, IC)) { 2937 return I; 2938 } 2939 break; 2940 2941 case Intrinsic::x86_sse2_maskmov_dqu: 2942 case Intrinsic::x86_avx_maskstore_ps: 2943 case Intrinsic::x86_avx_maskstore_pd: 2944 case Intrinsic::x86_avx_maskstore_ps_256: 2945 case Intrinsic::x86_avx_maskstore_pd_256: 2946 case Intrinsic::x86_avx2_maskstore_d: 2947 case 
Intrinsic::x86_avx2_maskstore_q: 2948 case Intrinsic::x86_avx2_maskstore_d_256: 2949 case Intrinsic::x86_avx2_maskstore_q_256: 2950 if (simplifyX86MaskedStore(II, IC)) { 2951 return nullptr; 2952 } 2953 break; 2954 2955 case Intrinsic::x86_addcarry_32: 2956 case Intrinsic::x86_addcarry_64: 2957 if (Value *V = simplifyX86addcarry(II, IC.Builder)) { 2958 return IC.replaceInstUsesWith(II, V); 2959 } 2960 break; 2961 2962 case Intrinsic::x86_avx512_pternlog_d_128: 2963 case Intrinsic::x86_avx512_pternlog_d_256: 2964 case Intrinsic::x86_avx512_pternlog_d_512: 2965 case Intrinsic::x86_avx512_pternlog_q_128: 2966 case Intrinsic::x86_avx512_pternlog_q_256: 2967 case Intrinsic::x86_avx512_pternlog_q_512: 2968 if (Value *V = simplifyTernarylogic(II, IC.Builder)) { 2969 return IC.replaceInstUsesWith(II, V); 2970 } 2971 break; 2972 default: 2973 break; 2974 } 2975 return std::nullopt; 2976 } 2977 2978 std::optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic( 2979 InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, 2980 bool &KnownBitsComputed) const { 2981 switch (II.getIntrinsicID()) { 2982 default: 2983 break; 2984 case Intrinsic::x86_mmx_pmovmskb: 2985 case Intrinsic::x86_sse_movmsk_ps: 2986 case Intrinsic::x86_sse2_movmsk_pd: 2987 case Intrinsic::x86_sse2_pmovmskb_128: 2988 case Intrinsic::x86_avx_movmsk_ps_256: 2989 case Intrinsic::x86_avx_movmsk_pd_256: 2990 case Intrinsic::x86_avx2_pmovmskb: { 2991 // MOVMSK copies the vector elements' sign bits to the low bits 2992 // and zeros the high bits. 2993 unsigned ArgWidth; 2994 if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) { 2995 ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>. 2996 } else { 2997 auto *ArgType = cast<FixedVectorType>(II.getArgOperand(0)->getType()); 2998 ArgWidth = ArgType->getNumElements(); 2999 } 3000 3001 // If we don't need any of low bits then return zero, 3002 // we know that DemandedMask is non-zero already. 3003 APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth); 3004 Type *VTy = II.getType(); 3005 if (DemandedElts.isZero()) { 3006 return ConstantInt::getNullValue(VTy); 3007 } 3008 3009 // We know that the upper bits are set to zero. 3010 Known.Zero.setBitsFrom(ArgWidth); 3011 KnownBitsComputed = true; 3012 break; 3013 } 3014 } 3015 return std::nullopt; 3016 } 3017 3018 std::optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic( 3019 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 3020 APInt &UndefElts2, APInt &UndefElts3, 3021 std::function<void(Instruction *, unsigned, APInt, APInt &)> 3022 simplifyAndSetOp) const { 3023 unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements(); 3024 switch (II.getIntrinsicID()) { 3025 default: 3026 break; 3027 case Intrinsic::x86_xop_vfrcz_ss: 3028 case Intrinsic::x86_xop_vfrcz_sd: 3029 // The instructions for these intrinsics are speced to zero upper bits not 3030 // pass them through like other scalar intrinsics. So we shouldn't just 3031 // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics. 3032 // Instead we should return a zero vector. 3033 if (!DemandedElts[0]) { 3034 IC.addToWorklist(&II); 3035 return ConstantAggregateZero::get(II.getType()); 3036 } 3037 3038 // Only the lower element is used. 3039 DemandedElts = 1; 3040 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3041 3042 // Only the lower element is undefined. The high elements are zero. 
3043 UndefElts = UndefElts[0]; 3044 break; 3045 3046 // Unary scalar-as-vector operations that work column-wise. 3047 case Intrinsic::x86_sse_rcp_ss: 3048 case Intrinsic::x86_sse_rsqrt_ss: 3049 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3050 3051 // If lowest element of a scalar op isn't used then use Arg0. 3052 if (!DemandedElts[0]) { 3053 IC.addToWorklist(&II); 3054 return II.getArgOperand(0); 3055 } 3056 // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions 3057 // checks). 3058 break; 3059 3060 // Binary scalar-as-vector operations that work column-wise. The high 3061 // elements come from operand 0. The low element is a function of both 3062 // operands. 3063 case Intrinsic::x86_sse_min_ss: 3064 case Intrinsic::x86_sse_max_ss: 3065 case Intrinsic::x86_sse_cmp_ss: 3066 case Intrinsic::x86_sse2_min_sd: 3067 case Intrinsic::x86_sse2_max_sd: 3068 case Intrinsic::x86_sse2_cmp_sd: { 3069 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3070 3071 // If lowest element of a scalar op isn't used then use Arg0. 3072 if (!DemandedElts[0]) { 3073 IC.addToWorklist(&II); 3074 return II.getArgOperand(0); 3075 } 3076 3077 // Only lower element is used for operand 1. 3078 DemandedElts = 1; 3079 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3080 3081 // Lower element is undefined if both lower elements are undefined. 3082 // Consider things like undef&0. The result is known zero, not undef. 3083 if (!UndefElts2[0]) 3084 UndefElts.clearBit(0); 3085 3086 break; 3087 } 3088 3089 // Binary scalar-as-vector operations that work column-wise. The high 3090 // elements come from operand 0 and the low element comes from operand 1. 3091 case Intrinsic::x86_sse41_round_ss: 3092 case Intrinsic::x86_sse41_round_sd: { 3093 // Don't use the low element of operand 0. 3094 APInt DemandedElts2 = DemandedElts; 3095 DemandedElts2.clearBit(0); 3096 simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts); 3097 3098 // If lowest element of a scalar op isn't used then use Arg0. 3099 if (!DemandedElts[0]) { 3100 IC.addToWorklist(&II); 3101 return II.getArgOperand(0); 3102 } 3103 3104 // Only lower element is used for operand 1. 3105 DemandedElts = 1; 3106 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3107 3108 // Take the high undef elements from operand 0 and take the lower element 3109 // from operand 1. 3110 UndefElts.clearBit(0); 3111 UndefElts |= UndefElts2[0]; 3112 break; 3113 } 3114 3115 // Three input scalar-as-vector operations that work column-wise. The high 3116 // elements come from operand 0 and the low element is a function of all 3117 // three inputs. 3118 case Intrinsic::x86_avx512_mask_add_ss_round: 3119 case Intrinsic::x86_avx512_mask_div_ss_round: 3120 case Intrinsic::x86_avx512_mask_mul_ss_round: 3121 case Intrinsic::x86_avx512_mask_sub_ss_round: 3122 case Intrinsic::x86_avx512_mask_max_ss_round: 3123 case Intrinsic::x86_avx512_mask_min_ss_round: 3124 case Intrinsic::x86_avx512_mask_add_sd_round: 3125 case Intrinsic::x86_avx512_mask_div_sd_round: 3126 case Intrinsic::x86_avx512_mask_mul_sd_round: 3127 case Intrinsic::x86_avx512_mask_sub_sd_round: 3128 case Intrinsic::x86_avx512_mask_max_sd_round: 3129 case Intrinsic::x86_avx512_mask_min_sd_round: 3130 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3131 3132 // If lowest element of a scalar op isn't used then use Arg0. 3133 if (!DemandedElts[0]) { 3134 IC.addToWorklist(&II); 3135 return II.getArgOperand(0); 3136 } 3137 3138 // Only lower element is used for operand 1 and 2. 
3139 DemandedElts = 1; 3140 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3141 simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3); 3142 3143 // Lower element is undefined if all three lower elements are undefined. 3144 // Consider things like undef&0. The result is known zero, not undef. 3145 if (!UndefElts2[0] || !UndefElts3[0]) 3146 UndefElts.clearBit(0); 3147 break; 3148 3149 // TODO: Add fmaddsub support? 3150 case Intrinsic::x86_sse3_addsub_pd: 3151 case Intrinsic::x86_sse3_addsub_ps: 3152 case Intrinsic::x86_avx_addsub_pd_256: 3153 case Intrinsic::x86_avx_addsub_ps_256: { 3154 // If none of the even or none of the odd lanes are required, turn this 3155 // into a generic FP math instruction. 3156 APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1)); 3157 APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2)); 3158 bool IsSubOnly = DemandedElts.isSubsetOf(SubMask); 3159 bool IsAddOnly = DemandedElts.isSubsetOf(AddMask); 3160 if (IsSubOnly || IsAddOnly) { 3161 assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only"); 3162 IRBuilderBase::InsertPointGuard Guard(IC.Builder); 3163 IC.Builder.SetInsertPoint(&II); 3164 Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1); 3165 return IC.Builder.CreateBinOp( 3166 IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1); 3167 } 3168 3169 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3170 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3171 UndefElts &= UndefElts2; 3172 break; 3173 } 3174 3175 // General per-element vector operations. 3176 case Intrinsic::x86_avx2_psllv_d: 3177 case Intrinsic::x86_avx2_psllv_d_256: 3178 case Intrinsic::x86_avx2_psllv_q: 3179 case Intrinsic::x86_avx2_psllv_q_256: 3180 case Intrinsic::x86_avx2_psrlv_d: 3181 case Intrinsic::x86_avx2_psrlv_d_256: 3182 case Intrinsic::x86_avx2_psrlv_q: 3183 case Intrinsic::x86_avx2_psrlv_q_256: 3184 case Intrinsic::x86_avx2_psrav_d: 3185 case Intrinsic::x86_avx2_psrav_d_256: { 3186 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3187 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3188 UndefElts &= UndefElts2; 3189 break; 3190 } 3191 3192 case Intrinsic::x86_sse2_pmulh_w: 3193 case Intrinsic::x86_avx2_pmulh_w: 3194 case Intrinsic::x86_avx512_pmulh_w_512: 3195 case Intrinsic::x86_sse2_pmulhu_w: 3196 case Intrinsic::x86_avx2_pmulhu_w: 3197 case Intrinsic::x86_avx512_pmulhu_w_512: 3198 case Intrinsic::x86_ssse3_pmul_hr_sw_128: 3199 case Intrinsic::x86_avx2_pmul_hr_sw: 3200 case Intrinsic::x86_avx512_pmul_hr_sw_512: { 3201 simplifyAndSetOp(&II, 0, DemandedElts, UndefElts); 3202 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2); 3203 // NOTE: mulh(undef,undef) != undef. 
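    // (Illustrative reasoning: for pmulhu.w the high half of a 16 x 16 -> 32
    // bit multiply can never be 0xFFFF, since 0xFFFF * 0xFFFF = 0xFFFE0001,
    // so the result is constrained and must not be folded to undef.)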
3204 break; 3205 } 3206 3207 case Intrinsic::x86_sse2_packssdw_128: 3208 case Intrinsic::x86_sse2_packsswb_128: 3209 case Intrinsic::x86_sse2_packuswb_128: 3210 case Intrinsic::x86_sse41_packusdw: 3211 case Intrinsic::x86_avx2_packssdw: 3212 case Intrinsic::x86_avx2_packsswb: 3213 case Intrinsic::x86_avx2_packusdw: 3214 case Intrinsic::x86_avx2_packuswb: 3215 case Intrinsic::x86_avx512_packssdw_512: 3216 case Intrinsic::x86_avx512_packsswb_512: 3217 case Intrinsic::x86_avx512_packusdw_512: 3218 case Intrinsic::x86_avx512_packuswb_512: { 3219 auto *Ty0 = II.getArgOperand(0)->getType(); 3220 unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements(); 3221 assert(VWidth == (InnerVWidth * 2) && "Unexpected input size"); 3222 3223 unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128; 3224 unsigned VWidthPerLane = VWidth / NumLanes; 3225 unsigned InnerVWidthPerLane = InnerVWidth / NumLanes; 3226 3227 // Per lane, pack the elements of the first input and then the second. 3228 // e.g. 3229 // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3]) 3230 // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15]) 3231 for (int OpNum = 0; OpNum != 2; ++OpNum) { 3232 APInt OpDemandedElts(InnerVWidth, 0); 3233 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3234 unsigned LaneIdx = Lane * VWidthPerLane; 3235 for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) { 3236 unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum; 3237 if (DemandedElts[Idx]) 3238 OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt); 3239 } 3240 } 3241 3242 // Demand elements from the operand. 3243 APInt OpUndefElts(InnerVWidth, 0); 3244 simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts); 3245 3246 // Pack the operand's UNDEF elements, one lane at a time. 3247 OpUndefElts = OpUndefElts.zext(VWidth); 3248 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { 3249 APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane); 3250 LaneElts = LaneElts.getLoBits(InnerVWidthPerLane); 3251 LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum); 3252 UndefElts |= LaneElts; 3253 } 3254 } 3255 break; 3256 } 3257 3258 case Intrinsic::x86_sse2_pmadd_wd: 3259 case Intrinsic::x86_avx2_pmadd_wd: 3260 case Intrinsic::x86_avx512_pmaddw_d_512: 3261 case Intrinsic::x86_ssse3_pmadd_ub_sw_128: 3262 case Intrinsic::x86_avx2_pmadd_ub_sw: 3263 case Intrinsic::x86_avx512_pmaddubs_w_512: { 3264 // PMADD - demand both src elements that map to each dst element. 3265 auto *ArgTy = II.getArgOperand(0)->getType(); 3266 unsigned InnerVWidth = cast<FixedVectorType>(ArgTy)->getNumElements(); 3267 assert((VWidth * 2) == InnerVWidth && "Unexpected input size"); 3268 APInt OpDemandedElts = APIntOps::ScaleBitMask(DemandedElts, InnerVWidth); 3269 APInt Op0UndefElts(InnerVWidth, 0); 3270 APInt Op1UndefElts(InnerVWidth, 0); 3271 simplifyAndSetOp(&II, 0, OpDemandedElts, Op0UndefElts); 3272 simplifyAndSetOp(&II, 1, OpDemandedElts, Op1UndefElts); 3273 // NOTE: madd(undef,undef) != undef. 
3274 break; 3275 } 3276 3277 // PSHUFB 3278 case Intrinsic::x86_ssse3_pshuf_b_128: 3279 case Intrinsic::x86_avx2_pshuf_b: 3280 case Intrinsic::x86_avx512_pshuf_b_512: 3281 // PERMILVAR 3282 case Intrinsic::x86_avx_vpermilvar_ps: 3283 case Intrinsic::x86_avx_vpermilvar_ps_256: 3284 case Intrinsic::x86_avx512_vpermilvar_ps_512: 3285 case Intrinsic::x86_avx_vpermilvar_pd: 3286 case Intrinsic::x86_avx_vpermilvar_pd_256: 3287 case Intrinsic::x86_avx512_vpermilvar_pd_512: 3288 // PERMV 3289 case Intrinsic::x86_avx2_permd: 3290 case Intrinsic::x86_avx2_permps: { 3291 simplifyAndSetOp(&II, 1, DemandedElts, UndefElts); 3292 break; 3293 } 3294 3295 // SSE4A instructions leave the upper 64-bits of the 128-bit result 3296 // in an undefined state. 3297 case Intrinsic::x86_sse4a_extrq: 3298 case Intrinsic::x86_sse4a_extrqi: 3299 case Intrinsic::x86_sse4a_insertq: 3300 case Intrinsic::x86_sse4a_insertqi: 3301 UndefElts.setHighBits(VWidth / 2); 3302 break; 3303 } 3304 return std::nullopt; 3305 } 3306