Lines Matching +full:2 +full:- +full:point

1 //=- X86ScheduleZnver3.td - X86 Znver3 Scheduling ------------*- tablegen -*-=//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
18 //===----------------------------------------------------------------------===//
27 // outstanding operations (integer, load/store, and floating-point) and is
30 // to 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode.
34 // At each set-way intersection is an entry containing up to 8 macro ops.
37 // the op-cache, we limit the loop buffer to 8*12 = 96 to avoid loop unrolling
38 // leading to excessive filling of the op-cache from frontend.
40 // AMD SOG 19h, 2.6.2 L1 Data Cache
41 // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
44 // <...> and can achieve 4-cycle load-to-use integer load latency.
48 // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
67 //===----------------------------------------------------------------------===//
69 //===----------------------------------------------------------------------===//
73 // 256 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
74 // The retire unit handles in-order commit of up to eight macro ops per cycle.
77 //===----------------------------------------------------------------------===//
79 //===----------------------------------------------------------------------===//
83 //===----------------------------------------------------------------------===//
93 //===----------------------------------------------------------------------===//
95 // AMD SOG 19h, 2.10.2 Execution Units
103 // AMD SOG 19h, 2.10.2 Execution Units
107 // AMD SOG 19h, 2.10.2 Execution Units
117 //===----------------------------------------------------------------------===//
119 // AMD SOG 19h, 2.10.2 Execution Units
123 // AMD SOG 19h, 2.10.2 Execution Units
131 //===----------------------------------------------------------------------===//
154 //===----------------------------------------------------------------------===//
170 Zn3ALU2, Zn3AGU2, // scheduler 2
177 //===----------------------------------------------------------------------===//
178 // Floating-Point Unit
182 // The processor uses <...> two decoupled independent floating point schedulers
183 // each servicing two FP pipelines and one store or FP-to-integer pipeline.
187 //===----------------------------------------------------------------------===//
191 // Agner, 22.10 Floating point execution pipes
192 // There are six floating point/vector execution pipes,
197 def Zn3FP45 : ProcResource<2>;
201 //===----------------------------------------------------------------------===//
202 // AMD SOG 19h, 2.11.1 Floating Point Execution Resources
204 // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
217 // AMD SOG 19h, 2.11.1 Floating Point Execution Resources
218 // FDIV unit can support 2 simultaneous operations in flight
220 // FIXME: BufferSize=2 ?
223 // Moves and Logical operations on Floating Point Data Types
264 //===----------------------------------------------------------------------===//
266 // AMD SOG 19h, 2.11 Floating-Point Unit
267 // Stores and floating point to general purpose register transfer
268 // have 2 dedicated pipelines (pipe 5 and 6).
271 // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
284 // Moves and Logical operations on Floating Point Data Types
290 // AMD SOG 19h, 2.11 Floating-Point Unit
291 // Stores and floating point to general purpose register transfer
292 // have 2 dedicated pipelines (pipe 5 and 6).
295 // AMD SOG 19h, 2.11 Floating-Point Unit
330 //===----------------------------------------------------------------------===//
332 // Agner, 21.8 Register renaming and out-of-order schedulers
333 // The floating point register file has 160 vector registers
334 // of 128 bits each in Zen 1 and 256 bits each in Zen 2.
340 // AMD SOG 19h, 2.11 Floating-Point Unit
341 // The floating-point scheduler has a 2*32 entry macro op capacity.
342 // AMD SOG 19h, 2.11 Floating-Point Unit
348 let BufferSize = !mul(2, 32);
351 // AMD SOG 19h, 2.11 Floating-Point Unit
353 // even if floating-point scheduler is full.
357 //===----------------------------------------------------------------------===//
358 // Load-Store Unit
361 // AMD SOG 19h, 2.12 Load-Store Unit
362 // The LS unit contains three largely independent pipe-lines
363 // enabling the execution of three 256-bit memory operations per cycle.
366 // AMD SOG 19h, 2.12 Load-Store Unit
370 // AMD SOG 19h, 2.12 Load-Store Unit
371 // The LS unit can process up to 72 out-of-order loads.
377 // AMD SOG 19h, 2.12 Load-Store Unit
380 def Zn3Store : ProcResource<2> {
381 // AMD SOG 19h, 2.12 Load-Store Unit
382 // The LS unit utilizes a 64-entry store queue (STQ).
388 //===----------------------------------------------------------------------===//
390 //===----------------------------------------------------------------------===//
393 // Instructions with folded loads are usually micro-fused, so they only appear
394 // as two micro-ops when dispatched by the schedulers.
473 //===----------------------------------------------------------------------===//
475 //===----------------------------------------------------------------------===//
483 // AMD SOG 19h, 2.11 Floating-Point Unit
486 def : ReadAdvance<ReadInt2Fpu, -1>;
495 // Model the effect of clobbering the read-write mask operand of the GATHER operation.
506 defm : Zn3WriteResInt<WriteStore, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
507 defm : Zn3WriteResInt<WriteStoreNT, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency, [1, 2], 1>;
523 let NumMicroOps = 2;
550 let ReleaseAtCycles = [2];
577 let Latency = 2;
579 let NumMicroOps = 2;
586 // A 3-operand LEA (base, index, offset).
590 CheckIsImmOperand<2>,
591 CheckNot<CheckImmOperand<2, 1>>
604 let Latency = 2; // FIXME: not from llvm-exegesis
606 let NumMicroOps = 2;
612 defm : Zn3WriteResIntPair<WriteIMul8, [Zn3Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
613 defm : Zn3WriteResIntPair<WriteIMul16, [Zn3Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
614 defm : Zn3WriteResIntPair<WriteIMul16Imm, [Zn3Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
615 defm : Zn3WriteResIntPair<WriteIMul16Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
616 defm : Zn3WriteResIntPair<WriteIMul32, [Zn3Multiplier], 3, [3], 2>; // Integer 32-bit multiplication.
617 defm : Zn3WriteResIntPair<WriteMULX32, [Zn3Multiplier], 3, [1], 2>; // Integer 32-bit Unsigned Multiply Without Affecting Flags.
618 defm : Zn3WriteResIntPair<WriteIMul32Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
619 defm : Zn3WriteResIntPair<WriteIMul32Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
620 defm : Zn3WriteResIntPair<WriteIMul64, [Zn3Multiplier], 3, [3], 2>; // Integer 64-bit multiplication.
621 defm : Zn3WriteResIntPair<WriteMULX64, [Zn3Multiplier], 3, [1], 2>; // Integer 64-bit Unsigned Multiply Without Affecting Flags.
622 defm : Zn3WriteResIntPair<WriteIMul64Imm, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
623 defm : Zn3WriteResIntPair<WriteIMul64Reg, [Zn3Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
627 defm : Zn3WriteResInt<WriteBSWAP32, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
628 defm : Zn3WriteResInt<WriteBSWAP64, [Zn3ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
644 let NumMicroOps = !add(Zn3WriteCMPXCHG8rr.NumMicroOps, 2);
649 let Latency = 3; // FIXME: not from llvm-exegesis
656 let Latency = 4; // FIXME: not from llvm-exegesis
664 let ReleaseAtCycles = [2];
665 let NumMicroOps = 2;
670 let Latency = !add(Znver3Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
671 let ReleaseAtCycles = [1, 1, 2];
677 let Latency = !add(Znver3Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
678 let ReleaseAtCycles = [1, 1, 2];
679 let NumMicroOps = 2;
684 // FIXME: uops for 8-bit division measures as 2; for others it's a guess.
685 // FIXME: latency for 8-bit division measures as 10; for others it's a guess.
686 defm : Zn3WriteResIntPair<WriteDiv8, [Zn3Divider], 10, [10], 2>;
687 defm : Zn3WriteResIntPair<WriteDiv16, [Zn3Divider], 11, [11], 2>;
688 defm : Zn3WriteResIntPair<WriteDiv32, [Zn3Divider], 13, [13], 2>;
689 defm : Zn3WriteResIntPair<WriteDiv64, [Zn3Divider], 17, [17], 2>;
690 defm : Zn3WriteResIntPair<WriteIDiv8, [Zn3Divider], 10, [10], 2>;
691 defm : Zn3WriteResIntPair<WriteIDiv16, [Zn3Divider], 11, [11], 2>;
692 defm : Zn3WriteResIntPair<WriteIDiv32, [Zn3Divider], 13, [13], 2>;
693 defm : Zn3WriteResIntPair<WriteIDiv64, [Zn3Divider], 17, [17], 2>;
695 defm : Zn3WriteResIntPair<WriteBSF, [Zn3ALU1], 3, [3], 6, /*LoadUOps=*/2>; // Bit scan forward.
696 defm : Zn3WriteResIntPair<WriteBSR, [Zn3ALU1], 4, [4], 6, /*LoadUOps=*/2>; // Bit scan reverse.
716 defm : Zn3WriteResIntPair<WriteTZCNT, [Zn3ALU12], 2, [1], 2>; // Trailing zero count.
719 let Latency = 2;
721 let NumMicroOps = 2;
726 defm : Zn3WriteResInt<WriteFCMOV, [Zn3ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
727 defm : Zn3WriteResInt<WriteSETCC, [Zn3ALU03], 1, [2], 1>; // Set register based on condition code.
728 defm : Zn3WriteResInt<WriteSETCCStore, [Zn3ALU03, Zn3AGU012, Zn3Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
732 defm : Zn3WriteResInt<WriteBitTestImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 1), [1, 1, 1], 2>;
735 defm : Zn3WriteResInt<WriteBitTestSet, [Zn3ALU12], 2, [2], 2>; // Bit Test + Set
736 defm : Zn3WriteResInt<WriteBitTestSetImmLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 4>;
737 defm : Zn3WriteResInt<WriteBitTestSetRegLd, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 1], 9>;
746 let ReleaseAtCycles = [2];
754 let ReleaseAtCycles = [1, 1, 2];
784 let NumMicroOps = !add(Zn3WriteRotateLeftRI.NumMicroOps, 2);
800 let NumMicroOps = !add(Zn3WriteRotateRightRCL.NumMicroOps, 2);
814 let NumMicroOps = !add(Zn3WriteRotateLeftRCL.NumMicroOps, 2);
819 defm : Zn3WriteResInt<WriteSHDrri, [Zn3ALU12], 2, [3], 4>;
820 defm : Zn3WriteResInt<WriteSHDrrcl, [Zn3ALU12], 2, [3], 5>;
821 defm : Zn3WriteResInt<WriteSHDmri, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
822 defm : Zn3WriteResInt<WriteSHDmrcl, [Zn3AGU012, Zn3Load, Zn3ALU12], !add(Znver3Model.LoadLatency, 2), [1, 1, 4], 6>;
826 defm : Zn3WriteResIntPair<WriteBLS, [Zn3ALU0123], 2, [2], 2, /*LoadUOps=*/1>;
835 defm : Zn3WriteResIntPair<WriteJump, [Zn3BRU01], 1, [1], 1>; // FIXME: not from llvm-exegesis
837 // Floating point. This covers both scalar and vector operations.
849 let Latency = 2; // FIXME: not from llvm-exegesis
851 let NumMicroOps = 2;
867 defm : Zn3WriteResXMMPair<WriteFAdd, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub.
870 let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
872 let NumMicroOps = 2;
880 let Latency = !add(Znver3Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
882 let NumMicroOps = 2;
887 defm : Zn3WriteResXMMPair<WriteFAddX, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (XMM).
888 defm : Zn3WriteResYMMPair<WriteFAddY, [Zn3FPFAdd01], 3, [1], 1>; // Floating point add/sub (YMM).
889 defm : X86WriteResPairUnsupported<WriteFAddZ>; // Floating point add/sub (ZMM).
890 defm : Zn3WriteResXMMPair<WriteFAdd64, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub.
891 defm : Zn3WriteResXMMPair<WriteFAdd64X, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (XMM).
892 defm : Zn3WriteResYMMPair<WriteFAdd64Y, [Zn3FPFAdd01], 3, [1], 1>; // Floating point double add/sub (YMM).
893 defm : X86WriteResPairUnsupported<WriteFAdd64Z>; // Floating point double add/sub (ZMM).
894 defm : Zn3WriteResXMMPair<WriteFCmp, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare.
895 defm : Zn3WriteResXMMPair<WriteFCmpX, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (XMM).
896 defm : Zn3WriteResYMMPair<WriteFCmpY, [Zn3FPFMul01], 1, [1], 1>; // Floating point compare (YMM).
897 defm : X86WriteResPairUnsupported<WriteFCmpZ>; // Floating point compare (ZMM).
898 defm : Zn3WriteResXMMPair<WriteFCmp64, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare.
899 defm : Zn3WriteResXMMPair<WriteFCmp64X, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (XMM).
900 defm : Zn3WriteResYMMPair<WriteFCmp64Y, [Zn3FPFMul01], 1, [1], 1>; // Floating point double compare (YMM).
901 defm : X86WriteResPairUnsupported<WriteFCmp64Z>; // Floating point double compare (ZMM).
902 defm : Zn3WriteResXMMPair<WriteFCom, [Zn3FPFMul01], 3, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (X87).
903 defm : Zn3WriteResXMMPair<WriteFComX, [Zn3FPFMul01], 4, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point compare to flags (SSE).
904 defm : Zn3WriteResXMMPair<WriteFMul, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication.
905 defm : Zn3WriteResXMMPair<WriteFMulX, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (XMM).
906 defm : Zn3WriteResYMMPair<WriteFMulY, [Zn3FPFMul01], 3, [1], 1>; // Floating point multiplication (YMM).
907 defm : X86WriteResPairUnsupported<WriteFMulZ>; // Floating point multiplication (ZMM).
908 defm : Zn3WriteResXMMPair<WriteFMul64, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication.
909 defm : Zn3WriteResXMMPair<WriteFMul64X, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (XMM).
910 defm : Zn3WriteResYMMPair<WriteFMul64Y, [Zn3FPFMul01], 3, [1], 1>; // Floating point double multiplication (YMM).
911 defm : X86WriteResPairUnsupported<WriteFMul64Z>; // Floating point double multiplication (ZMM).
912 defm : Zn3WriteResXMMPair<WriteFDiv, [Zn3FPFDiv], 11, [3], 1>; // Floating point division.
913 defm : Zn3WriteResXMMPair<WriteFDivX, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (XMM).
914 defm : Zn3WriteResYMMPair<WriteFDivY, [Zn3FPFDiv], 11, [3], 1>; // Floating point division (YMM).
915 defm : X86WriteResPairUnsupported<WriteFDivZ>; // Floating point division (ZMM).
916 defm : Zn3WriteResXMMPair<WriteFDiv64, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division.
917 defm : Zn3WriteResXMMPair<WriteFDiv64X, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (XMM).
918 defm : Zn3WriteResYMMPair<WriteFDiv64Y, [Zn3FPFDiv], 13, [5], 1>; // Floating point double division (YMM).
919 defm : X86WriteResPairUnsupported<WriteFDiv64Z>; // Floating point double division (ZMM).
920 defm : Zn3WriteResXMMPair<WriteFSqrt, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root.
921 defm : Zn3WriteResXMMPair<WriteFSqrtX, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (XMM).
922 defm : Zn3WriteResYMMPair<WriteFSqrtY, [Zn3FPFDiv], 15, [5], 1>; // Floating point square root (YMM).
923 defm : X86WriteResPairUnsupported<WriteFSqrtZ>; // Floating point square root (ZMM).
924 defm : Zn3WriteResXMMPair<WriteFSqrt64, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root.
925 defm : Zn3WriteResXMMPair<WriteFSqrt64X, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (XMM).
926 defm : Zn3WriteResYMMPair<WriteFSqrt64Y, [Zn3FPFDiv], 21, [9], 1>; // Floating point double square root (YMM).
927 defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; // Floating point double square root (ZMM).
928 defm : Zn3WriteResXMMPair<WriteFSqrt80, [Zn3FPFDiv], 22, [23], 1>; // FIXME: latency not from llvm-exegesis // Floating point long double square root.
929 defm : Zn3WriteResXMMPair<WriteFRcp, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate.
930 defm : Zn3WriteResXMMPair<WriteFRcpX, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (XMM).
931 defm : Zn3WriteResYMMPair<WriteFRcpY, [Zn3FPFMul01], 3, [1], 1>; // Floating point reciprocal estimate (YMM).
932 defm : X86WriteResPairUnsupported<WriteFRcpZ>; // Floating point reciprocal estimate (ZMM).
933 defm : Zn3WriteResXMMPair<WriteFRsqrt, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate.
934 defm : Zn3WriteResXMMPair<WriteFRsqrtX, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (XMM).
935 defm : Zn3WriteResYMMPair<WriteFRsqrtY, [Zn3FPFDiv], 3, [1], 1>; // Floating point reciprocal square root estimate (YMM).
936 defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; // Floating point reciprocal square root estimate (ZMM).
941 defm : Zn3WriteResXMMPair<WriteDPPD, [Zn3FPFMul01], 9, [6], 3, /*LoadUOps=*/2>; // Floating point double dot product.
942 defm : Zn3WriteResXMMPair<WriteDPPS, [Zn3FPFMul01], 15, [8], 8, /*LoadUOps=*/2>; // Floating point single dot product.
943 defm : Zn3WriteResYMMPair<WriteDPPSY, [Zn3FPFMul01], 15, [8], 7, /*LoadUOps=*/1>; // Floating point single dot product (YMM).
944 defm : Zn3WriteResXMMPair<WriteFSign, [Zn3FPFMul01], 1, [2], 1>; // FIXME: latency not from llvm-exegesis // Floating point fabs/fchs.
945 defm : Zn3WriteResXMMPair<WriteFRnd, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding.
946 defm : Zn3WriteResYMMPair<WriteFRndY, [Zn3FPFCvt01], 3, [1], 1>; // Floating point rounding (YMM).
947 defm : X86WriteResPairUnsupported<WriteFRndZ>; // Floating point rounding (ZMM).
948 defm : Zn3WriteResXMMPair<WriteFLogic, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals.
949 defm : Zn3WriteResYMMPair<WriteFLogicY, [Zn3FPVMisc0123], 1, [1], 1>; // Floating point and/or/xor logicals (YMM).
950 defm : X86WriteResPairUnsupported<WriteFLogicZ>; // Floating point and/or/xor logicals (ZMM).
951 defm : Zn3WriteResXMMPair<WriteFTest, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions.
952 defm : Zn3WriteResYMMPair<WriteFTestY, [Zn3FPFMisc12], 1, [2], 2>; // FIXME: latency not from llvm-exegesis // Floating point TEST instructions (YMM).
953 defm : X86WriteResPairUnsupported<WriteFTestZ>; // Floating point TEST instructions (ZMM).
954 defm : Zn3WriteResXMMPair<WriteFShuffle, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles.
955 defm : Zn3WriteResYMMPair<WriteFShuffleY, [Zn3FPVShuf01], 1, [1], 1>; // Floating point vector shuffles (YMM).
956 defm : X86WriteResPairUnsupported<WriteFShuffleZ>; // Floating point vector shuffles (ZMM).
957 defm : Zn3WriteResXMMPair<WriteFVarShuffle, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles.
958 defm : Zn3WriteResYMMPair<WriteFVarShuffleY, [Zn3FPVShuf01], 3, [1], 1>; // Floating point vector variable shuffles (YMM).
959 defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; // Floating point vector variable shuffles (ZMM).
960 defm : Zn3WriteResXMMPair<WriteFBlend, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends.
961 defm : Zn3WriteResYMMPair<WriteFBlendY, [Zn3FPFMul01], 1, [1], 1>; // Floating point vector blends (YMM).
962 defm : X86WriteResPairUnsupported<WriteFBlendZ>; // Floating point vector blends (ZMM).
968 defm : Zn3WriteResXMMPair<WriteFHAdd, [Zn3FPFAdd0], 6, [2], 4>;
969 defm : Zn3WriteResYMMPair<WriteFHAddY, [Zn3FPFAdd0], 6, [2], 3, /*LoadUOps=*/1>;
971 defm : Zn3WriteResXMMPair<WritePHAdd, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
972 defm : Zn3WriteResXMMPair<WritePHAddX, [Zn3FPVAdd0], 2, [2], 4>;
973 defm : Zn3WriteResYMMPair<WritePHAddY, [Zn3FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
1016 defm : Zn3WriteResXMM<WriteVecMoveToGpr, [Zn3FPLd01], 1, [2], 1>;
1017 defm : Zn3WriteResXMM<WriteVecMoveFromGpr, [Zn3FPLd01], 1, [2], 1>;
1021 let ReleaseAtCycles = [1, 2];
1022 let NumMicroOps = 2;
1029 let NumMicroOps = 2;
1045 let NumMicroOps = 2;
1097 defm : Zn3WriteResXMMPair<WriteVecTest, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions.
1098 defm : Zn3WriteResYMMPair<WriteVecTestY, [Zn3FPVAdd12, Zn3FPSt], 1, [1, 1], 2>; // FIXME: latency not from llvm-exegesis // Vector integer TEST instructions (YMM).
1129 defm : Zn3WriteResXMMPair<WritePSADBW, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW.
1130 defm : Zn3WriteResXMMPair<WritePSADBWX, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (XMM).
1131 defm : Zn3WriteResYMMPair<WritePSADBWY, [Zn3FPVAdd0123], 3, [2], 1>; // Vector PSADBW (YMM).
1133 defm : Zn3WriteResXMMPair<WriteMPSAD, [Zn3FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>; // Vector MPSAD.
1139 defm : Zn3WriteResXMMPair<WriteVecInsert, [Zn3FPLd01], 1, [2], 2, /*LoadUOps=*/-1>; // Insert gpr to vector element.
1140 defm : Zn3WriteResXMM<WriteVecExtract, [Zn3FPLd01], 1, [2], 2>; // Extract vector element to gpr.
1141 defm : Zn3WriteResXMM<WriteVecExtractSt, [Zn3FPSt, Zn3Store], !add(1, Znver3Model.StoreLatency), [1, 1], 2>; // Extract vector element and store.
1150 defm : Zn3WriteResXMMPair<WriteCvtSD2I, [Zn3FPFCvt01], 2, [2], 2>; // Double -> Integer.
1151 defm : Zn3WriteResXMMPair<WriteCvtPD2I, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Integer (XMM).
1152 defm : Zn3WriteResYMMPair<WriteCvtPD2IY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Integer (YMM).
1153 defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; // Double -> Integer (ZMM).
1157 let ReleaseAtCycles = [2];
1158 let NumMicroOps = 2;
1162 defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>; // Float -> Integer.
1164 defm : Zn3WriteResXMMPair<WriteCvtPS2I, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1165 defm : Zn3WriteResYMMPair<WriteCvtPS2IY, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Integer (YMM).
1166 defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; // Float -> Integer (ZMM).
1168 defm : Zn3WriteResXMMPair<WriteCvtI2SD, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double.
1169 defm : Zn3WriteResXMMPair<WriteCvtI2PD, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1170 defm : Zn3WriteResYMMPair<WriteCvtI2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
1171 defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; // Integer -> Double (ZMM).
1174 let Latency = 2;
1176 let NumMicroOps = 2;
1180 defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float.
1181 defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1182 defm : Zn3WriteResYMMPair<WriteCvtI2PSY, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1183 defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; // Integer -> Float (ZMM).
1188 let NumMicroOps = 2;
1192 defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion.
1193 defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
1194 defm : Zn3WriteResYMMPair<WriteCvtPS2PDY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
1195 defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; // Float -> Double size conversion (ZMM).
1197 defm : Zn3WriteResXMMPair<WriteCvtSD2SS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion.
1198 defm : Zn3WriteResXMMPair<WriteCvtPD2PS, [Zn3FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
1199 defm : Zn3WriteResYMMPair<WriteCvtPD2PSY, [Zn3FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
1200 defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; // Double -> Float size conversion (ZMM).
1202 defm : Zn3WriteResXMMPair<WriteCvtPH2PS, [Zn3FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
1203 defm : Zn3WriteResYMMPair<WriteCvtPH2PSY, [Zn3FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
1204 defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; // Half -> Float size conversion (ZMM).
1206 defm : Zn3WriteResXMM<WriteCvtPS2PH, [Zn3FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1207 defm : Zn3WriteResYMM<WriteCvtPS2PHY, [Zn3FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
1208 defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; // Float -> Half size conversion (ZMM).
1209 defm : Zn3WriteResXMM<WriteCvtPS2PHSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(3, Znver3Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
1210 defm : Zn3WriteResYMM<WriteCvtPS2PHYSt, [Zn3FPFCvt01, Zn3FPSt, Zn3Store], !add(6, Znver3Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
1211 defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; // Float -> Half + store size conversion (ZMM).
1217 let Latency = 2;
1218 let ReleaseAtCycles = [2];
1219 let NumMicroOps = 2;
1225 let ReleaseAtCycles = [1, 1, 2];
1232 let ReleaseAtCycles = [2];
1239 let ReleaseAtCycles = [1, 1, 2];
1245 let Latency = 2;
1247 let NumMicroOps = 2;
1292 defm : Zn3WriteResXMMPair<WritePCmpIStrI, [Zn3FPVAdd0123], 2, [8], 4>;
1301 // Carry-less multiplication instructions.
1305 defm : Zn3WriteResInt<WriteEMMS, [Zn3ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
1308 defm : Zn3WriteResInt<WriteLDMXCSR, [Zn3AGU012, Zn3Load, Zn3ALU0123], !add(Znver3Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
1309 defm : Zn3WriteResInt<WriteSTMXCSR, [Zn3ALU0123, Zn3AGU012, Zn3Store], !add(1, Znver3Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
1311 // Catch-all for expensive system instructions.
1315 let Latency = 0; // FIXME: not from llvm-exegesis
1322 let Latency = 10; // FIXME: not from llvm-exegesis
1329 defm : Zn3WriteResYMMPair<WriteFShuffle256, [Zn3FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
1330 defm : Zn3WriteResYMMPair<WriteFVarShuffle256, [Zn3FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
1331 defm : Zn3WriteResYMMPair<WriteShuffle256, [Zn3FPVShuf], 2, [1], 1>; // 256-bit width vector shuffles.
1349 let ReleaseAtCycles = [1, 1, 2];
1357 let NumMicroOps = 2;
1363 let ReleaseAtCycles = [1, 1, 2];
1370 let ReleaseAtCycles = [1, 1, 2];
1371 let NumMicroOps = 2;
1375 defm : Zn3WriteResYMMPair<WriteVPMOV256, [Zn3FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
1376 defm : Zn3WriteResYMMPair<WriteVarShuffle256, [Zn3FPVShuf], 5, [1], 2, /*LoadUOps=*/1>; // 256-bit width vector variable shuffles.
1402 defm : Zn3WriteResInt<WriteNop, [Zn3ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
1421 let NumMicroOps = 2;
1426 defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>; // Compare+Exchange - TODO RMW support.
1502 // NOTE: XORPSrr, XORPDrr are not zero-cycle!
1517 // NOTE: PXORrr,PANDNrr are not zero-cycle!
1531 // PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
1545 // GPR Zero-idioms.
1551 // SSE XMM Zero-idioms.
1566 // AVX XMM Zero-idioms.
1581 // AVX YMM Zero-idioms.