xref: /llvm-project/llvm/lib/Target/X86/X86ScheduleZnver4.td (revision 9cd774d1e49f792b7546e5309c7b27d653b37132)
1//=- X86ScheduleZnver4.td - X86 Znver4 Scheduling ------------*- tablegen -*-=//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the machine model for Znver4 to support instruction
10// scheduling and other instruction cost heuristics.
11// Based on:
12//  * AMD Software Optimization Guide for AMD Family 19h Processors.
13//    https://www.amd.com/system/files/TechDocs/56665.zip
14//===----------------------------------------------------------------------===//
15
// Scheduling machine model for Znver4. Besides the SchedMachineModel fields
// assigned with `let`, it declares two plain `int` fields (VecLoadLatency and
// StoreLatency) that later definitions in this file read back as
// Znver4Model.VecLoadLatency and Znver4Model.StoreLatency.
16def Znver4Model : SchedMachineModel {
17  // AMD SOG 19h, 2.9.6 Dispatch
18  // The processor may dispatch up to 6 macro ops per cycle
19  // into the execution engine.
20  let IssueWidth = 6;
21  // AMD SOG 19h, 2.10.3
22  // The retire control unit (RCU) tracks the completion status of all
23  // outstanding operations (integer, load/store, and floating-point) and is
24  // the final arbiter for exception processing and recovery.
25  // The unit can receive up to 6 macro ops dispatched per cycle and track up
26  // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
  // The non-SMT figure is the one used here.
27  let MicroOpBufferSize = 320;
28  // AMD SOG 19h, 2.9.1 Op Cache
29  // The op cache is organized as an associative cache with 64 sets and 8 ways.
30  // At each set-way intersection is an entry containing up to 8 macro ops.
31  // The maximum capacity of the op cache is 6.75K ops.
32  // Assuming a maximum dispatch of 9 ops/cy and a mispredict cost of 12cy from
33  // the op-cache, we limit the loop buffer to 9*12 = 108 to avoid loop
34  // unrolling leading to excessive filling of the op-cache from frontend.
  // NOTE(review): the 9 ops/cy assumed here differs from IssueWidth = 6 above
  // — confirm which figure the loop-buffer heuristic should be based on.
35  let LoopMicroOpBufferSize = 108;
36  // AMD SOG 19h, 2.6.2 L1 Data Cache
37  // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
38  // AMD SOG 19h, 2.12 L1 Data Cache
39  // The AGU and LS pipelines are optimized for simple address generation modes.
40  // <...> and can achieve 4-cycle load-to-use integer load latency.
41  let LoadLatency = 4;
42  // AMD SOG 19h, 2.12 L1 Data Cache
43  // The AGU and LS pipelines are optimized for simple address generation modes.
44  // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
  // Declared with `int` (a new field on this record) rather than set via `let`.
45  int VecLoadLatency = 7;
46  // Latency of a simple store operation.
  // Also a new `int` field declared on this record.
47  int StoreLatency = 1;
48  // FIXME:
49  let HighLatency = 25; // FIXME: any better choice?
50  // AMD SOG 19h, 2.8 Optimizing Branching
51  // The branch misprediction penalty is in the range from 11 to 18 cycles,
52  // <...>. The common case penalty is 13 cycles.
53  let MispredictPenalty = 13;
54
55  let PostRAScheduler = 1; // Enable Post RegAlloc Scheduler pass.
56
57  let CompleteModel = 1;
58}
59
60let SchedModel = Znver4Model in {
61
62
63//===----------------------------------------------------------------------===//
64// RCU
65//===----------------------------------------------------------------------===//
66
67// AMD SOG 19h, 2.10.3 Retire Control Unit
68// The unit can receive up to 6 macro ops dispatched per cycle and track up to
69// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
70// The retire unit handles in-order commit of up to nine macro ops per cycle.
// The first argument reuses the model's MicroOpBufferSize (320); the second is
// the nine-per-cycle retire width quoted above.
71def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
72
73//===----------------------------------------------------------------------===//
74// Integer Execution Unit
75//
76
77// AMD SOG 19h, 2.4 Superscalar Organization
78// The processor uses four decoupled independent integer scheduler queues,
79// each one servicing one ALU pipeline and one or two other pipelines
80
81//
82// Execution pipes
83//===----------------------------------------------------------------------===//
84
85// AMD SOG 19h, 2.10.2 Execution Units
86// The processor contains 4 general purpose integer execution pipes.
87// Each pipe has an ALU capable of general purpose integer operations.
// Each ALU pipe is declared as its own single-unit resource so that the aliases
// and groups defined later can name individual pipes.
88def Zn4ALU0 : ProcResource<1>;
89def Zn4ALU1 : ProcResource<1>;
90def Zn4ALU2 : ProcResource<1>;
91def Zn4ALU3 : ProcResource<1>;
92
93// AMD SOG 19h, 2.10.2 Execution Units
94// There is also a separate branch execution unit.
// Only BRU1 is a resource of its own; Zn4BRU0 is defined further down as an
// alias of Zn4ALU0.
95def Zn4BRU1 : ProcResource<1>;
96
97// AMD SOG 19h, 2.10.2 Execution Units
98// There are three Address Generation Units (AGUs) for all load and store
99// address generation. There are also 3 store data movement units
100// associated with the same schedulers as the AGUs.
// One single-unit resource per AGU; no separate resources are declared here
// for the store data movement units.
101def Zn4AGU0 : ProcResource<1>;
102def Zn4AGU1 : ProcResource<1>;
103def Zn4AGU2 : ProcResource<1>;
104
105//
106// Execution Units
107//===----------------------------------------------------------------------===//
108
// The three defvars below are aliases for existing ALU pipes, not new
// resources: anything assigned to Zn4Divider or Zn4BRU0 occupies Zn4ALU0, and
// anything assigned to Zn4Multiplier occupies Zn4ALU1.

109// AMD SOG 19h, 2.10.2 Execution Units
110// ALU0 additionally has divide <...> execution capability.
111defvar Zn4Divider = Zn4ALU0;
112
113// AMD SOG 19h, 2.10.2 Execution Units
114// ALU0 additionally has <...> branch execution capability.
115defvar Zn4BRU0 = Zn4ALU0;
116
117// Integer Multiplication issued on ALU1.
118defvar Zn4Multiplier = Zn4ALU1;
119
120// Execution pipeline grouping
121//===----------------------------------------------------------------------===//
122
// Resource groups over the integer pipes. A group is named after the pipe
// indices it contains (e.g. Zn4ALU03 = ALU0 + ALU3).

123// General ALU operations
124def Zn4ALU0123 : ProcResGroup<[Zn4ALU0, Zn4ALU1, Zn4ALU2, Zn4ALU3]>;
125
126// General AGU operations
127def Zn4AGU012 : ProcResGroup<[Zn4AGU0, Zn4AGU1, Zn4AGU2]>;
128
129// Control flow: jumps, calls
// Zn4BRU0 is an alias of Zn4ALU0, so this group is effectively {ALU0, BRU1}.
130def Zn4BRU01 : ProcResGroup<[Zn4BRU0, Zn4BRU1]>;
131
132// Everything that isn't control flow, but still needs to access CC register,
133// namely: conditional moves, SETcc.
134def Zn4ALU03 : ProcResGroup<[Zn4ALU0, Zn4ALU3]>;
135
136// Zn4ALU1 handles complex bit twiddling: CRC/PDEP/PEXT
137
138// Simple bit twiddling: bit test, shift/rotate, bit extraction
139def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;
140
141
142//
143// Scheduling
144//===----------------------------------------------------------------------===//
145
146// AMD SOG 19h, 2.10.3 Retire Control Unit
147// The integer physical register file (PRF) consists of 224 registers.
// Arguments in order: register count (224), register classes (GR64, CCR),
// per-class cost list ([1, 1]), per-class move-elimination flags ([1, 0], i.e.
// set for GR64 and clear for CCR), then the two values annotated inline.
148def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
149                              6,  // Max moves that can be eliminated per cycle.
150                              0>; // Restrict move elimination to zero regs.
151
152// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
153// AMD SOG 19h, 2.10.1 Schedulers
154// The schedulers can receive up to six macro ops per cycle, with a limit of
155// two per scheduler. Each scheduler can issue one micro op per cycle into
156// each of its associated pipelines
// All four integer schedulers are modeled as one group with a single shared
// buffer; the per-row comments show which pipes belong to which scheduler.
// Zn4BRU0 is an alias of Zn4ALU0, so Zn4ALU0 is named twice in the first row.
157def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
158                           Zn4ALU1, Zn4AGU1,          // scheduler 1
159                           Zn4ALU2, Zn4AGU2,          // scheduler 2
160                           Zn4ALU3,          Zn4BRU1  // scheduler 3
161                          ]> {
  // 4 schedulers x 24 entries = 96, per the capacity note above.
162  let BufferSize = !mul(4, 24);
163}
164
165
166//===----------------------------------------------------------------------===//
167// Floating-Point Unit
168//
169
170// AMD SOG 19h, 2.4 Superscalar Organization
171// The processor uses <...> two decoupled independent floating point schedulers
172// each servicing two FP pipelines and one store or FP-to-integer pipeline.
173
174//
175// Execution pipes
176//===----------------------------------------------------------------------===//
177
178// AMD SOG 19h, 2.10.1 Schedulers
179// <...>, and six FPU pipes.
180// Agner, 22.10 Floating point execution pipes
181// There are six floating point/vector execution pipes,
// Pipes 0-3 are individual single-unit resources; pipes 4 and 5 are modeled
// together as one resource with two units (Zn4FP45).
182def Zn4FP0  : ProcResource<1>;
183def Zn4FP1  : ProcResource<1>;
184def Zn4FP2  : ProcResource<1>;
185def Zn4FP3  : ProcResource<1>;
186def Zn4FP45 : ProcResource<2>;
187
188//
189// Execution Units
190//===----------------------------------------------------------------------===//
191// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
// Every defvar in this section is an alias that names a functional unit after
// the FP pipe (Zn4FP0..Zn4FP3) it lives on; none of them introduces a new
// resource, so units mapped to the same pipe contend with each other.
192
193// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
194defvar Zn4FPFMul0 = Zn4FP0;
195defvar Zn4FPFMul1 = Zn4FP1;
196
197// (v)FADD*
198defvar Zn4FPFAdd0 = Zn4FP2;
199defvar Zn4FPFAdd1 = Zn4FP3;
200
201// All convert operations except pack/unpack
202defvar Zn4FPFCvt0 = Zn4FP2;
203defvar Zn4FPFCvt1 = Zn4FP3;
204
205// All Divide and Square Root except Reciprocal Approximation
206// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
207// FDIV unit can support 2 simultaneous operations in flight
208// even though it occupies a single pipe.
209// FIXME: BufferSize=2 ?
210defvar Zn4FPFDiv = Zn4FP1;
211
212// Moves and Logical operations on Floating Point Data Types
213defvar Zn4FPFMisc0 = Zn4FP0;
214defvar Zn4FPFMisc1 = Zn4FP1;
215defvar Zn4FPFMisc2 = Zn4FP2;
216defvar Zn4FPFMisc3 = Zn4FP3;
217
218// Integer Adds, Subtracts, and Compares
219// Some complex VADD operations are not available in all pipes.
220defvar Zn4FPVAdd0 = Zn4FP0;
221defvar Zn4FPVAdd1 = Zn4FP1;
222defvar Zn4FPVAdd2 = Zn4FP2;
223defvar Zn4FPVAdd3 = Zn4FP3;
224
225// Integer Multiplies, SAD, Blendvb
// Note the pipes are FP0 and FP3, not FP0 and FP1.
226defvar Zn4FPVMul0 = Zn4FP0;
227defvar Zn4FPVMul1 = Zn4FP3;
228
229// Data Shuffles, Packs, Unpacks, Permute
230// Some complex shuffle operations are only available in pipe1.
231defvar Zn4FPVShuf = Zn4FP1;
232defvar Zn4FPVShufAux = Zn4FP2;
233
234// Bit Shift Left/Right operations
235defvar Zn4FPVShift0 = Zn4FP1;
236defvar Zn4FPVShift1 = Zn4FP2;
237
238// Moves and Logical operations on Packed Integer Data Types
239defvar Zn4FPVMisc0 = Zn4FP0;
240defvar Zn4FPVMisc1 = Zn4FP1;
241defvar Zn4FPVMisc2 = Zn4FP2;
242defvar Zn4FPVMisc3 = Zn4FP3;
243
244// *AES*
245defvar Zn4FPAES0 = Zn4FP0;
246defvar Zn4FPAES1 = Zn4FP1;
247
248// *CLM*
249defvar Zn4FPCLM0 = Zn4FP0;
250defvar Zn4FPCLM1 = Zn4FP1;
251
252// Execution pipeline grouping
253//===----------------------------------------------------------------------===//
254
255// AMD SOG 19h, 2.11 Floating-Point Unit
256// Stores and floating point to general purpose register transfer
257// have 2 dedicated pipelines (pipe 5 and 6).
// The group below covers only the four general FP pipes (FP0-FP3); the
// store / FP-to-integer pipes are not part of it.
258def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;
259
260// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
261def Zn4FPFMul01 : ProcResGroup<[Zn4FPFMul0, Zn4FPFMul1]>;
262
263// (v)FADD*
264// Some complex VADD operations are not available in all pipes.
265def Zn4FPFAdd01 : ProcResGroup<[Zn4FPFAdd0, Zn4FPFAdd1]>;
266
267// All convert operations except pack/unpack
268def Zn4FPFCvt01 : ProcResGroup<[Zn4FPFCvt0, Zn4FPFCvt1]>;
269
270// All Divide and Square Root except Reciprocal Approximation
// The group definition is left commented out; Zn4FPFDiv already exists as a
// defvar alias for a single pipe.
271// def Zn4FPFDiv : ProcResGroup<[Zn4FPFDiv]>;
272
273// Moves and Logical operations on Floating Point Data Types
274def Zn4FPFMisc0123 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1, Zn4FPFMisc2, Zn4FPFMisc3]>;
275
276// FIXUP and RANGE use FP01 pipelines
277def Zn4FPFMisc01 : ProcResGroup<[Zn4FPFMisc0, Zn4FPFMisc1]>;
278def Zn4FPFMisc12 : ProcResGroup<[Zn4FPFMisc1, Zn4FPFMisc2]>;
279// SCALE instructions use FP23 pipelines
280def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
281def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1,Zn4FPFMisc2, Zn4FPFMisc3]>;
282
283// Loads, Stores and Move to General Register (EX) Operations
284// AMD SOG 19h, 2.11 Floating-Point Unit
285// Stores and floating point to general purpose register transfer
286// have 2 dedicated pipelines (pipe 5 and 6).
// Alias of the two-unit Zn4FP45 resource, not a new resource.
287defvar Zn4FPLd01 = Zn4FP45;
288
289// AMD SOG 19h, 2.11 Floating-Point Unit
290// Note that FP stores are supported on two pipelines,
291// but throughput is limited to one per cycle.
// A single-unit resource whose Super is Zn4FP45: this is how the
// one-store-per-cycle limit is expressed on top of the two-unit pipe pair.
292let Super = Zn4FP45 in
293def Zn4FPSt : ProcResource<1>;
294
// Resource groups for the vector-integer, opmask, AES and CLM units, built
// from the per-unit pipe aliases defined earlier.

295// Integer Adds, Subtracts, and Compares
296// Some complex VADD operations are not available in all pipes.
297def Zn4FPVAdd0123 : ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1, Zn4FPVAdd2, Zn4FPVAdd3]>;
298
299def Zn4FPVAdd01: ProcResGroup<[Zn4FPVAdd0, Zn4FPVAdd1]>;
300def Zn4FPVAdd12: ProcResGroup<[Zn4FPVAdd1, Zn4FPVAdd2]>;
301
302// AVX512 Opmask pipelines
// NOTE(review): Zn4FPOpMask01 is built from FP2 and FP3 despite the "01"
// suffix — the suffix does not name pipe indices here; confirm this is intended.
303def Zn4FPOpMask01: ProcResGroup<[Zn4FP2, Zn4FP3]>;
304def Zn4FPOpMask4: ProcResGroup<[Zn4FP45]>;
305
306// Integer Multiplies, SAD, Blendvb
307def Zn4FPVMul01 : ProcResGroup<[Zn4FPVMul0, Zn4FPVMul1]>;
308
309// Data Shuffles, Packs, Unpacks, Permute
310// Some complex shuffle operations are only available in pipe1.
311def Zn4FPVShuf01 : ProcResGroup<[Zn4FPVShuf, Zn4FPVShufAux]>;
312
313// Bit Shift Left/Right operations
314def Zn4FPVShift01 : ProcResGroup<[Zn4FPVShift0, Zn4FPVShift1]>;
315
316// Moves and Logical operations on Packed Integer Data Types
317def Zn4FPVMisc0123 : ProcResGroup<[Zn4FPVMisc0, Zn4FPVMisc1, Zn4FPVMisc2, Zn4FPVMisc3]>;
318
319// *AES*
320def Zn4FPAES01 : ProcResGroup<[Zn4FPAES0, Zn4FPAES1]>;
321
322// *CLM*
323def Zn4FPCLM01 : ProcResGroup<[Zn4FPCLM0, Zn4FPCLM1]>;
324
325
326//
327// Scheduling
328//===----------------------------------------------------------------------===//
329
330// Agner, 21.8 Register renaming and out-of-order schedulers
331// The floating point register file has 192 vector registers
332// of 512b each in zen4.
// Arguments in order: register count (192), register classes, per-class cost
// list, per-class move-elimination flags, then the two values annotated inline.
// NOTE(review): the move-elimination list [0, 1, 1] has three entries for four
// register classes, so VR512 has no explicit entry — confirm this is intended.
333def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 1, 1],
334                            6,  // Max moves that can be eliminated per cycle.
335                            0>; // Restrict move elimination to zero regs.
336
337// AMD SOG 19h, 2.11 Floating-Point Unit
338// The floating-point scheduler has a 2*32 entry macro op capacity.
339// AMD SOG 19h, 2.11 Floating-Point Unit
340// <...> the scheduler can issue 1 micro op per cycle for each pipe.
341// FIXME: those are two separate schedulers, not a single big one.
// Pipes 4 and 5 appear only through the combined Zn4FP45 resource, listed on
// the scheduler-1 row; the commented-out names mark where each would sit.
342def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2,          /*Zn4FP4,*/ // scheduler 0
343                          Zn4FP1, Zn4FP3, Zn4FP45 /*Zn4FP5*/  // scheduler 1
344                         ]> {
  // 2 schedulers x 32 entries = 64, per the capacity note above.
345  let BufferSize = !mul(2, 32);
346}
347
348// AMD SOG 19h, 2.11 Floating-Point Unit
349// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
350// even if floating-point scheduler is full.
351// FIXME: how to model this properly?
352
353
354//===----------------------------------------------------------------------===//
355// Load-Store Unit
356//
357
358// AMD SOG 19h, 2.12 Load-Store Unit
359// The LS unit contains three largely independent pipe-lines
360// enabling the execution of three 256-bit memory operations per cycle.
// Parent resource with three units; Zn4Load and Zn4Store below both name it as
// their Super.
361def Zn4LSU : ProcResource<3>;
362
363// AMD SOG 19h, 2.12 Load-Store Unit
364// All three memory operations can be loads.
365let Super = Zn4LSU in
366def Zn4Load : ProcResource<3> {
367  // AMD SOG 19h, 2.12 Load-Store Unit
368  // The LS unit can process up to 72 out-of-order loads.
369  let BufferSize = 72;
370}
371
// Registers Zn4Load as the resource backing the model's load queue.
372def Zn4LoadQueue : LoadQueue<Zn4Load>;
373
374// AMD SOG 19h, 2.12 Load-Store Unit
375// A maximum of two of the memory operations can be stores.
376let Super = Zn4LSU in
377def Zn4Store : ProcResource<2> {
378  // AMD SOG 19h, 2.12 Load-Store Unit
379  // The LS unit utilizes a 64-entry store queue (STQ).
380  let BufferSize = 64;
381}
382
// Registers Zn4Store as the resource backing the model's store queue.
383def Zn4StoreQueue : StoreQueue<Zn4Store>;
384
385//===----------------------------------------------------------------------===//
386// Basic helper classes.
387//===----------------------------------------------------------------------===//
388
389// Many SchedWrites are defined in pairs with and without a folded load.
390// Instructions with folded loads are usually micro-fused, so they only appear
391// as two micro-ops when dispatched by the schedulers.
392// This multiclass defines the resource usage for variants with and without
393// folded loads.
394
// Emits one anonymous WriteRes record binding SchedRW to ExePorts.
//   SchedRW  - the scheduling write being described.
//   ExePorts - resources the write consumes.
//   Lat      - value assigned to Latency (default 1).
//   Res      - value assigned to ReleaseAtCycles (default: empty list).
//   UOps     - value assigned to NumMicroOps (default 1).
395multiclass __Zn4WriteRes<SchedWrite SchedRW, list<ProcResourceKind> ExePorts,
396                         int Lat = 1, list<int> Res = [], int UOps = 1> {
397  def : WriteRes<SchedRW, ExePorts> {
398    let Latency = Lat;
399    let ReleaseAtCycles = Res;
400    let NumMicroOps = UOps;
401  }
402}
403
// Emits two WriteRes records for a foldable write: one for the register form
// (SchedRW) and one for the load-folded form (SchedRW.Folded).
//   ExePorts, Lat, Res, UOps - passed unchanged to the register form.
//   LoadLat  - added to Lat for the folded form.
//   LoadUOps - added to UOps for the folded form.
//   AGU      - address-generation resource prepended for the folded form.
//   LoadRes  - ReleaseAtCycles entry used for Zn4Load in the folded form.
404multiclass __Zn4WriteResPair<X86FoldableSchedWrite SchedRW,
405                             list<ProcResourceKind> ExePorts, int Lat,
406                             list<int> Res, int UOps, int LoadLat, int LoadUOps,
407                             ProcResourceKind AGU, int LoadRes> {
  // Register form.
408  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
409
  // Folded form: resources are [AGU, Zn4Load] followed by ExePorts.
  // ReleaseAtCycles stays empty only when Res is empty and LoadRes is 1;
  // otherwise it is [1, LoadRes] followed by Res, or by one 1 per ExePorts
  // entry when Res is empty, so the list lines up with the resource list.
410  defm : __Zn4WriteRes<SchedRW.Folded,
411                       !listconcat([AGU, Zn4Load], ExePorts),
412                       !add(Lat, LoadLat),
413                       !if(!and(!empty(Res), !eq(LoadRes, 1)),
414                         [],
415                         !listconcat([1, LoadRes],
416                           !if(!empty(Res),
417                             !listsplat(1, !size(ExePorts)),
418                             Res))),
419                       !add(UOps, LoadUOps)>;
420}
421
422// For classes without folded loads.
// The Int/XMM/YMM/ZMM variants below have identical parameters and bodies:
// each forwards its arguments unchanged to __Zn4WriteRes. The distinct names
// only label the operand domain at the use site.
423multiclass Zn4WriteResInt<SchedWrite SchedRW,
424                          list<ProcResourceKind> ExePorts, int Lat = 1,
425                          list<int> Res = [], int UOps = 1> {
426  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
427}
428
429multiclass Zn4WriteResXMM<SchedWrite SchedRW,
430                          list<ProcResourceKind> ExePorts, int Lat = 1,
431                          list<int> Res = [], int UOps = 1> {
432  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
433}
434
435multiclass Zn4WriteResYMM<SchedWrite SchedRW,
436                          list<ProcResourceKind> ExePorts, int Lat = 1,
437                          list<int> Res = [], int UOps = 1> {
438  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
439}
440
441multiclass Zn4WriteResZMM<SchedWrite SchedRW,
442                          list<ProcResourceKind> ExePorts, int Lat = 1,
443                          list<int> Res = [], int UOps = 1> {
444  defm : __Zn4WriteRes<SchedRW, ExePorts, Lat, Res, UOps>;
445}
446
447// For classes with folded loads.
// Each wrapper forwards to __Zn4WriteResPair and fixes the load latency and
// the address/load resource for its domain:
//   Int         -> Znver4Model.LoadLatency,    Zn4AGU012
//   XMM/YMM/ZMM -> Znver4Model.VecLoadLatency, Zn4FPLd01
// LoadUOps defaults to 0 and LoadRes to 1 in all four.
448multiclass Zn4WriteResIntPair<X86FoldableSchedWrite SchedRW,
449                              list<ProcResourceKind> ExePorts, int Lat = 1,
450                              list<int> Res = [], int UOps = 1,
451                              int LoadUOps = 0, int LoadRes = 1> {
452  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
453                           Znver4Model.LoadLatency,
454                           LoadUOps, Zn4AGU012, LoadRes>;
455}
456
457multiclass Zn4WriteResXMMPair<X86FoldableSchedWrite SchedRW,
458                              list<ProcResourceKind> ExePorts, int Lat = 1,
459                              list<int> Res = [], int UOps = 1,
460                              int LoadUOps = 0, int LoadRes = 1> {
461  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
462                           Znver4Model.VecLoadLatency,
463                           LoadUOps, Zn4FPLd01, LoadRes>;
464}
465
466multiclass Zn4WriteResYMMPair<X86FoldableSchedWrite SchedRW,
467                              list<ProcResourceKind> ExePorts, int Lat = 1,
468                              list<int> Res = [], int UOps = 1,
469                              int LoadUOps = 0, int LoadRes = 1> {
470  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
471                           Znver4Model.VecLoadLatency,
472                           LoadUOps, Zn4FPLd01, LoadRes>;
473}
474
// Unlike the other pair wrappers, the ZMM variant defaults UOps to 2.
475multiclass Zn4WriteResZMMPair<X86FoldableSchedWrite SchedRW,
476                              list<ProcResourceKind> ExePorts, int Lat = 1,
477                              list<int> Res = [], int UOps = 2,
478                              int LoadUOps = 0, int LoadRes = 1> {
479  defm : __Zn4WriteResPair<SchedRW, ExePorts, Lat, Res, UOps,
480                           Znver4Model.VecLoadLatency,
481                           LoadUOps, Zn4FPLd01, LoadRes>;
482}
483
484//===----------------------------------------------------------------------===//
485// Here be dragons.
486//===----------------------------------------------------------------------===//
487
// Read advances for register operands of load-folded instructions. The
// advance equals the load latency that the folded write forms add to their
// own latency (integer: LoadLatency, vector: VecLoadLatency).
488def : ReadAdvance<ReadAfterLd, Znver4Model.LoadLatency>;
489
490def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
491def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
492def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;
493
494// AMD SOG 19h, 2.11 Floating-Point Unit
495// There is 1 cycle of added latency for a result to cross
496// from F to I or I to F domain.
// The advance is negative (-1), which expresses the extra cycle quoted above.
497def : ReadAdvance<ReadInt2Fpu, -1>;
498
499// Instructions with both a load and a store folded are modeled as a folded
500// load + WriteRMW.
// WriteRMW is given 0 micro-ops here.
501defm : Zn4WriteResInt<WriteRMW, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 1], 0>;
502
503// Loads, stores, and moves, not folded with other operations.
// Standalone loads use LoadLatency + 1.
504defm : Zn4WriteResInt<WriteLoad, [Zn4AGU012, Zn4Load], !add(Znver4Model.LoadLatency, 1), [1, 1], 1>;
505
506// Model the effect of clobbering the read-write mask operand of the GATHER operation.
507// Does not cost anything by itself, only has latency, matching that of the WriteLoad.
// No resources and 0 micro-ops; only the latency is set.
508defm : Zn4WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver4Model.LoadLatency, 1), [], 0>;
509
// Override for the 8/16-bit loads and 16-bit sign/zero-extending loads listed
// in the InstRW below: same latency as WriteLoad, but the AGU group entry in
// ReleaseAtCycles is 3 instead of 1.
510def Zn4WriteMOVSlow : SchedWriteRes<[Zn4AGU012, Zn4Load]> {
511  let Latency = !add(Znver4Model.LoadLatency, 1);
512  let ReleaseAtCycles = [3, 1];
513  let NumMicroOps = 1;
514}
515def : InstRW<[Zn4WriteMOVSlow], (instrs MOV8rm, MOV8rm_NOREX, MOV16rm, MOVSX16rm16, MOVSX16rm32, MOVZX16rm16, MOVSX16rm8, MOVZX16rm8)>;
516
// Plain and non-temporal stores share identical parameters: AGU group entry 1,
// Zn4Store entry 2 in ReleaseAtCycles.
517defm : Zn4WriteResInt<WriteStore, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
518defm : Zn4WriteResInt<WriteStoreNT, [Zn4AGU012, Zn4Store], Znver4Model.StoreLatency, [1, 2], 1>;
// Register move: latency 1, ReleaseAtCycles [4] on the four-pipe ALU group.
519defm : Zn4WriteResInt<WriteMove, [Zn4ALU0123], 1, [4], 1>;
520
521// Treat misc copies as a move.
522def : InstRW<[WriteMove], (instrs COPY)>;
523
// MOVBE overrides. The 16-bit load form uses the AGU, load and ALU resources;
// the store forms (16/32/64-bit) use the ALU, AGU and store resources.
// ReleaseAtCycles entries are positional and follow the resource-list order of
// each record.
524def Zn4WriteMOVBE16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
525  let Latency = Znver4Model.LoadLatency;
526  let ReleaseAtCycles = [1, 1, 4];
527  let NumMicroOps = 1;
528}
529def : InstRW<[Zn4WriteMOVBE16rm], (instrs MOVBE16rm)>;
530
531def Zn4WriteMOVBEmr : SchedWriteRes<[Zn4ALU0123, Zn4AGU012, Zn4Store]> {
532  let Latency = Znver4Model.StoreLatency;
533  let ReleaseAtCycles = [4, 1, 1];
534  let NumMicroOps = 2;
535}
536def : InstRW<[Zn4WriteMOVBEmr], (instrs MOVBE16mr, MOVBE32mr, MOVBE64mr)>;
537
538// Arithmetic.
539defm : Zn4WriteResIntPair<WriteALU, [Zn4ALU0123], 1, [1], 1>; // Simple integer ALU op.
540
// Override for the accumulator-immediate ALU forms listed below: same latency
// and micro-op count as WriteALU, but ReleaseAtCycles [4] instead of [1].
541def Zn4WriteALUSlow : SchedWriteRes<[Zn4ALU0123]> {
542  let Latency = 1;
543  let ReleaseAtCycles = [4];
544  let NumMicroOps = 1;
545}
546def : InstRW<[Zn4WriteALUSlow], (instrs ADD8i8, ADD16i16, ADD32i32, ADD64i32,
547                                        AND8i8, AND16i16, AND32i32, AND64i32,
548                                         OR8i8,  OR16i16,  OR32i32,  OR64i32,
549                                        SUB8i8, SUB16i16, SUB32i32, SUB64i32,
550                                        XOR8i8, XOR16i16, XOR32i32, XOR64i32)>;
551
// Override for register-to-register sign/zero extensions with a 16-bit
// destination; its values are identical to Zn4WriteALUSlow.
552def Zn4WriteMoveExtend : SchedWriteRes<[Zn4ALU0123]> {
553  let Latency = 1;
554  let ReleaseAtCycles = [4];
555  let NumMicroOps = 1;
556}
557def : InstRW<[Zn4WriteMoveExtend], (instrs MOVSX16rr16, MOVSX16rr32, MOVZX16rr16, MOVSX16rr8, MOVZX16rr8)>;
558
// Override for materializing a 32-bit immediate: ReleaseAtCycles [2].
559def Zn4WriteMaterialize32bitImm: SchedWriteRes<[Zn4ALU0123]> {
560  let Latency = 1;
561  let ReleaseAtCycles = [2];
562  let NumMicroOps = 1;
563}
564def : InstRW<[Zn4WriteMaterialize32bitImm], (instrs MOV32ri, MOV32ri_alt, MOV64ri32)>;
565
// Register forms of PDEP/PEXT are bound to the single Zn4ALU1 pipe, latency 3.
566def Zn4WritePDEP_PEXT : SchedWriteRes<[Zn4ALU1]> {
567  let Latency = 3;
568  let ReleaseAtCycles = [1];
569  let NumMicroOps = 1;
570}
571def : InstRW<[Zn4WritePDEP_PEXT], (instrs PDEP32rr, PDEP64rr,
572                                          PEXT32rr, PEXT64rr)>;
573
574defm : Zn4WriteResIntPair<WriteADC, [Zn4ALU0123], 1, [4], 1>; // Integer ALU + flags op.
575
// Override for the 8-bit memory-destination ADC/SBB forms. The resource list
// is AGU, load, ALU, store; ReleaseAtCycles [1, 1, 7, 1] follows that order,
// so the 7 applies to the ALU group.
576def Zn4WriteADC8mr_SBB8mr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123, Zn4Store]> {
577  let Latency = 1;
578  let ReleaseAtCycles = [1, 1, 7, 1];
579  let NumMicroOps = 1;
580}
581def : InstRW<[Zn4WriteADC8mr_SBB8mr], (instrs ADC8mr, SBB8mr)>;
582
583// This is for simple LEAs with one or two input operands.
// Simple LEAs are bound to the AGU group.
584defm : Zn4WriteResInt<WriteLEA, [Zn4AGU012], 1, [1], 1>;     // LEA instructions can't fold loads.
585
586// This write is used for slow LEA instructions.
// Slow LEAs are bound to the ALU group instead, with latency 2 and 2 micro-ops.
587def Zn4Write3OpsLEA : SchedWriteRes<[Zn4ALU0123]> {
588  let Latency = 2;
589  let ReleaseAtCycles = [1];
590  let NumMicroOps = 2;
591}
592
593// On Znver4, a slow LEA is either a 3Ops LEA (base, index, offset),
594// or an LEA with a `Scale` value different than 1.
595def Zn4SlowLEAPredicate : MCSchedPredicate<
596  CheckAny<[
597    // A 3-operand LEA (base, index, offset).
598    IsThreeOperandsLEAFn,
599    // An LEA with a "Scale" different than 1.
    // Operand 2 must be an immediate and must not equal 1.
600    CheckAll<[
601      CheckIsImmOperand<2>,
602      CheckNot<CheckImmOperand<2, 1>>
603    ]>
604  ]>
605>;
606
// Variant write: picks Zn4Write3OpsLEA when the slow-LEA predicate holds and
// falls back to WriteLEA otherwise.
607def Zn4WriteLEA : SchedWriteVariant<[
608    SchedVar<Zn4SlowLEAPredicate, [Zn4Write3OpsLEA]>,
609    SchedVar<NoSchedPred,         [WriteLEA]>
610]>;
611
612def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
613
// LEA16r gets an unconditional override rather than the variant above.
614def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
615  let Latency = 2; // FIXME: not from llvm-exegesis
616  let ReleaseAtCycles = [4];
617  let NumMicroOps = 2;
618}
619
620def : InstRW<[Zn4SlowLEA16r], (instrs LEA16r)>;
621
622// Integer multiplication
// All multiplies are bound to Zn4Multiplier (an alias of Zn4ALU1). After the
// resource list the arguments are: latency, ReleaseAtCycles, micro-ops, and
// optionally LoadUOps for the load-folded form.
623defm : Zn4WriteResIntPair<WriteIMul8, [Zn4Multiplier], 3, [3], 1>; // Integer 8-bit multiplication.
624defm : Zn4WriteResIntPair<WriteIMul16, [Zn4Multiplier], 3, [3], 3, /*LoadUOps=*/1>; // Integer 16-bit multiplication.
625defm : Zn4WriteResIntPair<WriteIMul16Imm, [Zn4Multiplier], 4, [4], 2>; // Integer 16-bit multiplication by immediate.
626defm : Zn4WriteResIntPair<WriteIMul16Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 16-bit multiplication by register.
627defm : Zn4WriteResIntPair<WriteIMul32, [Zn4Multiplier], 3, [3], 2>;    // Integer 32-bit multiplication.
628defm : Zn4WriteResIntPair<WriteMULX32, [Zn4Multiplier], 3, [1], 2>;    // Integer 32-bit Unsigned Multiply Without Affecting Flags.
629defm : Zn4WriteResIntPair<WriteIMul32Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by immediate.
630defm : Zn4WriteResIntPair<WriteIMul32Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 32-bit multiplication by register.
631defm : Zn4WriteResIntPair<WriteIMul64, [Zn4Multiplier], 3, [3], 2>;    // Integer 64-bit multiplication.
632defm : Zn4WriteResIntPair<WriteMULX64, [Zn4Multiplier], 3, [1], 2>;    // Integer 64-bit Unsigned Multiply Without Affecting Flags.
633defm : Zn4WriteResIntPair<WriteIMul64Imm, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by immediate.
634defm : Zn4WriteResIntPair<WriteIMul64Reg, [Zn4Multiplier], 3, [1], 1>; // Integer 64-bit multiplication by register.
// The two high-part writes consume no resources and 0 micro-ops; they only
// carry a latency (4, plus LoadLatency for the load form).
635defm : Zn4WriteResInt<WriteIMulHLd, [], !add(4, Znver4Model.LoadLatency), [], 0>;  // Integer multiplication, high part.
636defm : Zn4WriteResInt<WriteIMulH, [], 4, [], 0>;  // Integer multiplication, high part.
637
638defm : Zn4WriteResInt<WriteBSWAP32, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 32-bit Swap.
639defm : Zn4WriteResInt<WriteBSWAP64, [Zn4ALU0123], 1, [1], 1>; // Byte Order (Endianness) 64-bit Swap.
640
641defm : Zn4WriteResIntPair<WriteCMPXCHG, [Zn4ALU0123], 3, [12], 5>; // Compare and set, compare and swap.
642
// 8-bit register form: same latency and ReleaseAtCycles as WriteCMPXCHG, but
// 3 micro-ops instead of 5.
643def Zn4WriteCMPXCHG8rr : SchedWriteRes<[Zn4ALU0123]> {
644  let Latency = 3;
645  let ReleaseAtCycles = [12];
646  let NumMicroOps = 3;
647}
648def : InstRW<[Zn4WriteCMPXCHG8rr], (instrs CMPXCHG8rr)>;
649
650defm : Zn4WriteResInt<WriteCMPXCHGRMW, [Zn4ALU0123], 3, [12], 6>;     // Compare and set, compare and swap.
651
// 8-bit memory forms: latency and micro-op count are derived from the register
// form above (LoadLatency + its latency; its micro-ops + 2).
652def Zn4WriteCMPXCHG8rm_LCMPXCHG8 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
653  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteCMPXCHG8rr.Latency);
654  let ReleaseAtCycles = [1, 1, 12];
655  let NumMicroOps = !add(Zn4WriteCMPXCHG8rr.NumMicroOps, 2);
656}
657def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
658
// CMPXCHG8B override; only the ALU group is listed as a resource.
659def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
660  let Latency = 3; // FIXME: not from llvm-exegesis
661  let ReleaseAtCycles = [24];
662  let NumMicroOps = 19;
663}
664def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
665
// CMPXCHG16B override, shared by the plain and LOCK-prefixed forms.
666def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
667  let Latency = 4; // FIXME: not from llvm-exegesis
668  let ReleaseAtCycles = [59];
669  let NumMicroOps = 28;
670}
671def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
672
// XCHG overrides. The register override covers only the 8/16-bit forms listed
// in its InstRW; the memory overrides are split by operand width (8/16-bit vs
// 32/64-bit) and differ in latency and micro-op count.
673def Zn4WriteWriteXCHGUnrenameable : SchedWriteRes<[Zn4ALU0123]> {
674  let Latency = 1;
675  let ReleaseAtCycles = [2];
676  let NumMicroOps = 2;
677}
678def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16ar)>;
679
680def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
681  let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
682  let ReleaseAtCycles = [1, 1, 2];
683  let NumMicroOps = 5;
684}
685def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
686
687def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
688  let Latency = !add(Znver4Model.LoadLatency, 2); // FIXME: not from llvm-exegesis
689  let ReleaseAtCycles = [1, 1, 2];
690  let NumMicroOps = 2;
691}
692def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
693
694// Integer division.
695// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
696// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
// All divides are bound to Zn4Divider (an alias of Zn4ALU0). In every entry
// the ReleaseAtCycles value equals the latency, and signed and unsigned forms
// of the same width use identical numbers.
697defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
698defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
699defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
700defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
701defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
702defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
703defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
704defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
705
// Bit scans and bit counts. The 16-bit register forms of POPCNT, LZCNT and
// TZCNT each get an override that keeps the generic latency and micro-op count
// but raises ReleaseAtCycles to [4] on the four-pipe ALU group.
706defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
707defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
708
709defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
710
711def Zn4WritePOPCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
712  let Latency = 1;
713  let ReleaseAtCycles = [4];
714  let NumMicroOps = 1;
715}
716def : InstRW<[Zn4WritePOPCNT16rr], (instrs POPCNT16rr)>;
717
718defm : Zn4WriteResIntPair<WriteLZCNT, [Zn4ALU0123], 1, [1], 1>; // Leading zero count.
719
720def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
721  let Latency = 1;
722  let ReleaseAtCycles = [4];
723  let NumMicroOps = 1;
724}
725def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
726
727defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
728
// Note: the generic TZCNT write uses Zn4ALU12, while this 16-bit override uses
// Zn4ALU0123.
729def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
730  let Latency = 2;
731  let ReleaseAtCycles = [4];
732  let NumMicroOps = 2;
733}
734def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
735
736defm : Zn4WriteResIntPair<WriteCMOV, [Zn4ALU03], 1, [1], 1>; // Conditional move.
737defm : Zn4WriteResInt<WriteFCMOV, [Zn4ALU0123], 7, [28], 7>; // FIXME: not from llvm-exegesis // X87 conditional move.
738defm : Zn4WriteResInt<WriteSETCC, [Zn4ALU03], 1, [2], 1>; // Set register based on condition code.
739defm : Zn4WriteResInt<WriteSETCCStore, [Zn4ALU03, Zn4AGU012, Zn4Store], 2, [2, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
740defm : Zn4WriteResInt<WriteLAHFSAHF, [Zn4ALU3], 1, [1], 1>; // Load/Store flags in AH.
741
// Bit Test: register form, then the two load-folding forms (ImmLd, RegLd),
// whose latency is the model's LoadLatency plus 1.
defm : Zn4WriteResInt<WriteBitTest, [Zn4ALU12], 1, [1], 1>;
defm : Zn4WriteResInt<WriteBitTestImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 2>;
defm : Zn4WriteResInt<WriteBitTestRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 1), [1, 1, 1], 7>;

// Bit Test + Set: register form, then the two load-folding forms, whose
// latency is the model's LoadLatency plus 2.
defm : Zn4WriteResInt<WriteBitTestSet, [Zn4ALU12], 2, [2], 2>;
defm : Zn4WriteResInt<WriteBitTestSetImmLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 4>;
defm : Zn4WriteResInt<WriteBitTestSetRegLd, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 1], 9>;
749
// Integer shifts and rotates. All three classes share the same parameters
// and are bound to Zn4ALU12.
defm : Zn4WriteResIntPair<WriteShift,   [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteShiftCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteRotate,  [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
754
// Scheduling entry for the RCL*r1 / RCR*r1 opcodes (all four operand widths).
def Zn4WriteRotateR1 : SchedWriteRes<[Zn4ALU12]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WriteRotateR1],
             (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
                     RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;
762
// Scheduling entry for the RCL*m1 / RCR*m1 opcodes. Latency and micro-op
// count are derived from Zn4WriteRotateR1: the model's LoadLatency is added
// to the latency and one micro-op is added to the count.
def Zn4WriteRotateM1 : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let NumMicroOps = !add(Zn4WriteRotateR1.NumMicroOps, 1);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateR1.Latency);
  let ReleaseAtCycles = [1, 1, 2];
}
def : InstRW<[Zn4WriteRotateM1],
             (instrs RCL8m1, RCL16m1, RCL32m1, RCL64m1,
                     RCR8m1, RCR16m1, RCR32m1, RCR64m1)>;
770
// Scheduling entry for the RCR*ri opcodes.
def Zn4WriteRotateRightRI : SchedWriteRes<[Zn4ALU12]> {
  let NumMicroOps = 7;
  let Latency = 3;
  let ReleaseAtCycles = [6];
}
def : InstRW<[Zn4WriteRotateRightRI],
             (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;
777
// Scheduling entry for the RCR*mi opcodes, derived from
// Zn4WriteRotateRightRI: LoadLatency is added to the latency and three
// micro-ops are added to the count.
// NOTE(review): the sibling RCL*mi / *mCL entries add two micro-ops, not
// three — confirm the difference is intended.
def Zn4WriteRotateRightMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let NumMicroOps = !add(Zn4WriteRotateRightRI.NumMicroOps, 3);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRI.Latency);
  let ReleaseAtCycles = [1, 1, 8];
}
def : InstRW<[Zn4WriteRotateRightMI],
             (instrs RCR8mi, RCR16mi, RCR32mi, RCR64mi)>;
784
// Scheduling entry for the RCL*ri opcodes.
def Zn4WriteRotateLeftRI : SchedWriteRes<[Zn4ALU12]> {
  let NumMicroOps = 9;
  let Latency = 4;
  let ReleaseAtCycles = [8];
}
def : InstRW<[Zn4WriteRotateLeftRI],
             (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;
791
// Scheduling entry for the RCL*mi opcodes, derived from Zn4WriteRotateLeftRI:
// LoadLatency is added to the latency and two micro-ops are added to the
// count.
def Zn4WriteRotateLeftMI : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let NumMicroOps = !add(Zn4WriteRotateLeftRI.NumMicroOps, 2);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRI.Latency);
  let ReleaseAtCycles = [1, 1, 8];
}
def : InstRW<[Zn4WriteRotateLeftMI],
             (instrs RCL8mi, RCL16mi, RCL32mi, RCL64mi)>;
798
// Generic WriteRotateCL class; the RCL/RCR *CL opcodes get dedicated entries.
defm : Zn4WriteResIntPair<WriteRotateCL, [Zn4ALU12], 1, [1], 1, /*LoadUOps=*/1>;
800
// Scheduling entry for the RCR*rCL opcodes.
def Zn4WriteRotateRightRCL : SchedWriteRes<[Zn4ALU12]> {
  let NumMicroOps = 7;
  let Latency = 3;
  let ReleaseAtCycles = [6];
}
def : InstRW<[Zn4WriteRotateRightRCL],
             (instrs RCR8rCL, RCR16rCL, RCR32rCL, RCR64rCL)>;
807
// Scheduling entry for the RCR*mCL opcodes, derived from
// Zn4WriteRotateRightRCL: LoadLatency is added to the latency and two
// micro-ops are added to the count.
def Zn4WriteRotateRightMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let NumMicroOps = !add(Zn4WriteRotateRightRCL.NumMicroOps, 2);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateRightRCL.Latency);
  let ReleaseAtCycles = [1, 1, 8];
}
def : InstRW<[Zn4WriteRotateRightMCL],
             (instrs RCR8mCL, RCR16mCL, RCR32mCL, RCR64mCL)>;
814
// Scheduling entry for the RCL*rCL opcodes.
def Zn4WriteRotateLeftRCL : SchedWriteRes<[Zn4ALU12]> {
  let NumMicroOps = 9;
  let Latency = 4;
  let ReleaseAtCycles = [8];
}
def : InstRW<[Zn4WriteRotateLeftRCL],
             (instrs RCL8rCL, RCL16rCL, RCL32rCL, RCL64rCL)>;
821
// Scheduling entry for the RCL*mCL opcodes, derived from
// Zn4WriteRotateLeftRCL: LoadLatency is added to the latency and two
// micro-ops are added to the count.
def Zn4WriteRotateLeftMCL : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU12]> {
  let NumMicroOps = !add(Zn4WriteRotateLeftRCL.NumMicroOps, 2);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteRotateLeftRCL.Latency);
  let ReleaseAtCycles = [1, 1, 8];
}
def : InstRW<[Zn4WriteRotateLeftMCL],
             (instrs RCL8mCL, RCL16mCL, RCL32mCL, RCL64mCL)>;
828
// Double shift instructions.
// Register forms (rri, rrcl):
defm : Zn4WriteResInt<WriteSHDrri, [Zn4ALU12], 2, [3], 4>;
defm : Zn4WriteResInt<WriteSHDrrcl, [Zn4ALU12], 2, [3], 5>;
// Memory forms (mri, mrcl); latency is the model's LoadLatency plus 2:
defm : Zn4WriteResInt<WriteSHDmri, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
defm : Zn4WriteResInt<WriteSHDmrcl, [Zn4AGU012, Zn4Load, Zn4ALU12], !add(Znver4Model.LoadLatency, 2), [1, 1, 4], 6>;
834
// BMI1 BEXTR/BLS, BMI2 BZHI
defm : Zn4WriteResIntPair<WriteBEXTR, [Zn4ALU12],   1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBLS,   [Zn4ALU0123], 1, [1], 1, /*LoadUOps=*/1>;
defm : Zn4WriteResIntPair<WriteBZHI,  [Zn4ALU12],   1, [1], 1, /*LoadUOps=*/1>;
839
// Register-clearing idioms such as xorps %xmm0, %xmm0. These can often
// bypass execution ports completely, so the entry uses latency 0 and
// ReleaseAtCycles = [0].
defm : Zn4WriteResInt<WriteZero, [Zn4ALU0123], 0, [0], 1>;
843
// Branches produce no value and therefore have no latency, but they still
// consume resources. Indirect branches can fold loads.
// FIXME: not from llvm-exegesis
defm : Zn4WriteResIntPair<WriteJump, [Zn4BRU01], 1, [1], 1>;
847
// Floating point. This covers both scalar and vector operations.

// WriteFLD0 / WriteFLD1 / WriteFLDC: latency is LoadLatency plus a constant.
defm : Zn4WriteResInt<WriteFLD0, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 4), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLD1, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;
defm : Zn4WriteResInt<WriteFLDC, [Zn4FPLd01, Zn4Load, Zn4FP1], !add(Znver4Model.LoadLatency, 7), [1, 1, 1], 1>;

// FP loads (plain and masked): latency is VecLoadLatency plus 1.
defm : Zn4WriteResXMM<WriteFLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;

// FP store: latency is the model's StoreLatency.
defm : Zn4WriteResXMM<WriteFStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
858
// Scheduling entry for the MOVHPD/MOVHPS store opcodes (SSE and VEX forms).
def Zn4WriteWriteFStoreMMX : SchedWriteRes<[Zn4FPSt, Zn4Store]> {
  let NumMicroOps = 2;
  let Latency = 2; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1];
}
def : InstRW<[Zn4WriteWriteFStoreMMX],
             (instrs MOVHPDmr, MOVHPSmr, VMOVHPDmr, VMOVHPSmr)>;
866
// Remaining FP store classes (X/Y and non-temporal variants); all use the
// model's StoreLatency on [Zn4FPSt, Zn4Store].
defm : Zn4WriteResXMM<WriteFStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteFStoreNTX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteFStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
872
// FP masked stores, 32- and 64-bit element classes, XMM then YMM.
defm : Zn4WriteResXMM<WriteFMaskedStore32,  [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteFMaskedStore64,  [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteFMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteFMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
877
// Floating point add/sub.
defm : Zn4WriteResXMMPair<WriteFAdd, [Zn4FPFAdd01], 3, [1], 1>;
879
// Scheduling entry for the x87 ADD/SUB/SUBR/MUL *_FI16m and *_FI32m opcodes.
def Zn4WriteX87Arith : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = 2;
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 24];
}
def : InstRW<[Zn4WriteX87Arith],
             (instrs ADD_FI16m, ADD_FI32m,
                     SUB_FI16m, SUB_FI32m,
                     SUBR_FI16m, SUBR_FI32m,
                     MUL_FI16m, MUL_FI32m)>;
889
// Scheduling entry for the x87 DIV/DIVR *_FI16m and *_FI32m opcodes.
def Zn4WriteX87Div : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = 2;
  let Latency = !add(Znver4Model.LoadLatency, 1); // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1, 1, 62];
}
def : InstRW<[Zn4WriteX87Div],
             (instrs DIV_FI16m, DIV_FI32m,
                     DIVR_FI16m, DIVR_FI32m)>;
897
// Floating point add/sub: XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFAddX, [Zn4FPFAdd01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFAddY, [Zn4FPFAdd01], 3, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFAddZ, [Zn4FPFAdd01], 3, [2], 1>;
// Floating point double add/sub: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFAdd64, [Zn4FPFAdd01], 3, [1], 1>;
defm : Zn4WriteResXMMPair<WriteFAdd64X, [Zn4FPFAdd01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFAdd64Y, [Zn4FPFAdd01], 3, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFAdd64Z, [Zn4FPFAdd01], 3, [2], 1>;
// Floating point compare: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFCmp, [Zn4FPFMul01], 2, [2], 1>;
defm : Zn4WriteResXMMPair<WriteFCmpX, [Zn4FPFMul01], 2, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFCmpY, [Zn4FPFMul01], 2, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFCmpZ, [Zn4FPFMul01], 2, [2], 1>;
// Floating point double compare: default, XMM, YMM, ZMM.
// NOTE(review): WriteFCmp64 uses latency 1 while its X/Y/Z siblings use 2 —
// confirm this is intended.
defm : Zn4WriteResXMMPair<WriteFCmp64, [Zn4FPFMul01], 1, [1], 1>;
defm : Zn4WriteResXMMPair<WriteFCmp64X, [Zn4FPFMul01], 2, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFCmp64Y, [Zn4FPFMul01], 2, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFCmp64Z, [Zn4FPFMul01], 2, [2], 1>;
// Floating point compare to flags (X87).
// FIXME: latency not from llvm-exegesis
defm : Zn4WriteResXMMPair<WriteFCom, [Zn4FPFMul01], 3, [2], 1>;
// Floating point compare to flags (SSE).
// FIXME: latency not from llvm-exegesis
defm : Zn4WriteResXMMPair<WriteFComX, [Zn4FPFMul01], 4, [2], 2>;
// Floating point multiplication: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFMul, [Zn4FPFMul01], 3, [1], 1>;
defm : Zn4WriteResXMMPair<WriteFMulX, [Zn4FPFMul01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFMulY, [Zn4FPFMul01], 3, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFMulZ, [Zn4FPFMul01], 3, [2], 1>;
// Floating point double multiplication: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFMul64, [Zn4FPFMul01], 3, [1], 1>;
defm : Zn4WriteResXMMPair<WriteFMul64X, [Zn4FPFMul01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFMul64Y, [Zn4FPFMul01], 3, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFMul64Z, [Zn4FPFMul01], 3, [2], 1>;
// Floating point division: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFDiv, [Zn4FPFDiv], 11, [3], 1>;
defm : Zn4WriteResXMMPair<WriteFDivX, [Zn4FPFDiv], 11, [3], 1>;
defm : Zn4WriteResYMMPair<WriteFDivY, [Zn4FPFDiv], 11, [3], 1>;
defm : Zn4WriteResZMMPair<WriteFDivZ, [Zn4FPFDiv], 11, [6], 1>;
// Floating point double division: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFDiv64, [Zn4FPFDiv], 13, [5], 1>;
defm : Zn4WriteResXMMPair<WriteFDiv64X, [Zn4FPFDiv], 13, [5], 1>;
defm : Zn4WriteResYMMPair<WriteFDiv64Y, [Zn4FPFDiv], 13, [5], 1>;
defm : Zn4WriteResZMMPair<WriteFDiv64Z, [Zn4FPFDiv], 13, [10], 1>;
// Floating point square root: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFSqrt, [Zn4FPFDiv], 15, [5], 1>;
defm : Zn4WriteResXMMPair<WriteFSqrtX, [Zn4FPFDiv], 15, [5], 1>;
defm : Zn4WriteResYMMPair<WriteFSqrtY, [Zn4FPFDiv], 15, [5], 1>;
defm : Zn4WriteResZMMPair<WriteFSqrtZ, [Zn4FPFDiv], 15, [10], 1>;
// Floating point double square root: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFSqrt64, [Zn4FPFDiv], 21, [9], 1>;
defm : Zn4WriteResXMMPair<WriteFSqrt64X, [Zn4FPFDiv], 21, [9], 1>;
defm : Zn4WriteResYMMPair<WriteFSqrt64Y, [Zn4FPFDiv], 21, [9], 1>;
defm : Zn4WriteResZMMPair<WriteFSqrt64Z, [Zn4FPFDiv], 21, [18], 1>;
// Floating point long double square root.
// FIXME: latency not from llvm-exegesis
defm : Zn4WriteResXMMPair<WriteFSqrt80, [Zn4FPFDiv], 22, [23], 1>;
// Floating point reciprocal estimate: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFRcp, [Zn4FPFMul01], 4, [1], 1>;
defm : Zn4WriteResXMMPair<WriteFRcpX, [Zn4FPFMul01], 4, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFRcpY, [Zn4FPFMul01], 5, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFRcpZ, [Zn4FPFMul01], 5, [2], 1>;
// Floating point reciprocal square root estimate: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFRsqrt, [Zn4FPFDiv], 4, [1], 1>;
defm : Zn4WriteResXMMPair<WriteFRsqrtX, [Zn4FPFDiv], 4, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFRsqrtY, [Zn4FPFDiv], 4, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFRsqrtZ, [Zn4FPFDiv], 5, [2], 1>;
// Fused Multiply Add: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFMA, [Zn4FPFMul01], 4, [2], 1>;
defm : Zn4WriteResXMMPair<WriteFMAX, [Zn4FPFMul01], 4, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFMAY, [Zn4FPFMul01], 4, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFMAZ, [Zn4FPFMul01], 4, [2], 1>;
// Floating point double dot product.
defm : Zn4WriteResXMMPair<WriteDPPD, [Zn4FPFMul01], 7, [6], 3, /*LoadUOps=*/2>;
// Floating point single dot product: XMM, then YMM.
defm : Zn4WriteResXMMPair<WriteDPPS, [Zn4FPFMul01], 11, [8], 8, /*LoadUOps=*/2>;
defm : Zn4WriteResYMMPair<WriteDPPSY, [Zn4FPFMul01], 11, [8], 7, /*LoadUOps=*/1>;
// Floating point fabs/fchs.
// FIXME: latency not from llvm-exegesis
defm : Zn4WriteResXMMPair<WriteFSign, [Zn4FPFMul01], 1, [2], 1>;
// Floating point rounding: default, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFRnd, [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFRndY, [Zn4FPFCvt01], 3, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFRndZ, [Zn4FPFCvt01], 3, [2], 1>;
959
// Floating point and/or/xor logicals: default, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFLogic, [Zn4FPVMisc0123], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFLogicY, [Zn4FPVMisc0123], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFLogicZ, [Zn4FPVMisc0123], 1, [2], 1>;
// Floating point TEST instructions: default, YMM, ZMM.
// FIXME: latency not from llvm-exegesis (applies to all three entries)
defm : Zn4WriteResXMMPair<WriteFTest, [Zn4FPFMisc12], 1, [2], 2>;
defm : Zn4WriteResYMMPair<WriteFTestY, [Zn4FPFMisc12], 1, [2], 2>;
defm : Zn4WriteResZMMPair<WriteFTestZ, [Zn4FPFMisc12], 1, [4], 1>;
// Floating point vector shuffles: default, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFShuffle, [Zn4FPVShuf01], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFShuffleY, [Zn4FPVShuf01], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFShuffleZ, [Zn4FPVShuf01], 1, [2], 1>;
// Floating point vector variable shuffles: default, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFVarShuffle, [Zn4FPVShuf01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFVarShuffleY, [Zn4FPVShuf01], 3, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFVarShuffleZ, [Zn4FPVShuf01], 3, [2], 1>;
// Floating point vector blends: default, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFBlend, [Zn4FPFMul01], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFBlendY, [Zn4FPFMul01], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFBlendZ, [Zn4FPFMul01], 1, [2], 1>;
// Fp vector variable blends: default, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteFVarBlend, [Zn4FPFMul01], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteFVarBlendY, [Zn4FPFMul01], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteFVarBlendZ, [Zn4FPFMul01], 1, [2], 1>;
978
// Horizontal Add/Sub (float and integer)
// Float variants (WriteFHAdd*), bound to Zn4FPFAdd0:
defm : Zn4WriteResXMMPair<WriteFHAdd, [Zn4FPFAdd0], 4, [2], 3>;
defm : Zn4WriteResYMMPair<WriteFHAddY, [Zn4FPFAdd0], 4, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WriteFHAddZ, [Zn4FPFAdd0], 6, [4], 3, /*LoadUOps=*/1>;
// Integer variants (WritePHAdd*), bound to Zn4FPVAdd0:
defm : Zn4WriteResXMMPair<WritePHAdd, [Zn4FPVAdd0], 2, [2], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePHAddX, [Zn4FPVAdd0], 2, [2], 3>;
defm : Zn4WriteResYMMPair<WritePHAddY, [Zn4FPVAdd0], 3, [3], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WritePHAddZ, [Zn4FPVAdd0], 2, [4], 3, /*LoadUOps=*/1>;
987
// Vector integer operations.

// Vector loads (plain, non-temporal, masked): latency is VecLoadLatency + 1.
defm : Zn4WriteResXMM<WriteVecLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadX, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecLoadNT, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecLoadNTY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecMaskedLoad, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecMaskedLoadY, [Zn4FPLd01, Zn4Load], !add(Znver4Model.VecLoadLatency, 1), [1, 1], 1>;

// Vector stores: latency is the model's StoreLatency.
defm : Zn4WriteResXMM<WriteVecStore, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreX, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
998
// Scheduling entry for VEXTRACTF128rri / VEXTRACTI128rri. Its Latency and
// NumMicroOps are also reused by the memory-form entries that follow.
def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
  let NumMicroOps = 1;
  let Latency = 4;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr],
             (instrs VEXTRACTF128rri, VEXTRACTI128rri)>;
1005
// Scheduling entry for VEXTRACTI128mri / VEXTRACTF128mri, derived from the
// register-form entry with one extra micro-op.
// NOTE(review): the resources are store units (Zn4FPSt, Zn4Store) but the
// latency adds LoadLatency rather than StoreLatency — confirm intended.
def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
}
def : InstRW<[Zn4WriteVEXTRACTI128mr],
             (instrs VEXTRACTI128mri, VEXTRACTF128mri)>;
1012
// Scheduling entry for VINSERTF128rmi. Latency is LoadLatency plus the
// VEXTRACTF128rr latency; the micro-op count equals the VEXTRACTF128rr count
// (the !add with 0 leaves it unchanged).
def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
  let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
}
def : InstRW<[Zn4WriteVINSERTF128rmr],
             (instrs VINSERTF128rmi)>;
1019
// Remaining vector store classes (YMM and non-temporal).
defm : Zn4WriteResYMM<WriteVecStoreY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResXMM<WriteVecStoreNT, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
defm : Zn4WriteResYMM<WriteVecStoreNTY, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [1, 1], 1>;
// Vector masked stores, 32- and 64-bit element classes, XMM then YMM.
defm : Zn4WriteResXMM<WriteVecMaskedStore32, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
defm : Zn4WriteResXMM<WriteVecMaskedStore64, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [4, 1], 10>;
defm : Zn4WriteResYMM<WriteVecMaskedStore32Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [12, 1], 42>;
defm : Zn4WriteResYMM<WriteVecMaskedStore64Y, [Zn4FPSt, Zn4Store], Znver4Model.StoreLatency, [6, 1], 18>;
1027
// Moves between vector registers and GPRs, in both directions; both classes
// are bound to Zn4FPLd01.
defm : Zn4WriteResXMM<WriteVecMoveToGpr,   [Zn4FPLd01], 1, [2], 1>;
defm : Zn4WriteResXMM<WriteVecMoveFromGpr, [Zn4FPLd01], 1, [2], 1>;
1030
// Scheduling entry for MMX_MOVQ2FR64rr / MMX_MOVQ2DQrr.
def Zn4WriteMOVMMX : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let NumMicroOps = 2;
  let Latency = 1;
  let ReleaseAtCycles = [1, 2];
}
def : InstRW<[Zn4WriteMOVMMX],
             (instrs MMX_MOVQ2FR64rr, MMX_MOVQ2DQrr)>;
1037
// Scheduling entry for MMX_MOVD64rr / MMX_MOVD64to64rr. Identical to
// Zn4WriteMOVMMX except that Zn4FPFMisc0123 is released at cycle 4, not 2.
def Zn4WriteMOVMMXSlow : SchedWriteRes<[Zn4FPLd01, Zn4FPFMisc0123]> {
  let NumMicroOps = 2;
  let Latency = 1;
  let ReleaseAtCycles = [1, 4];
}
def : InstRW<[Zn4WriteMOVMMXSlow],
             (instrs MMX_MOVD64rr, MMX_MOVD64to64rr)>;
1044
// Vector integer ALU op, no logicals.
defm : Zn4WriteResXMMPair<WriteVecALU, [Zn4FPVAdd0123], 1, [1], 1>;
1046
// Scheduling entry for EXTRQ / INSERTQ.
def Zn4WriteEXTRQ_INSERTQ : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let NumMicroOps = 1;
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
}
def : InstRW<[Zn4WriteEXTRQ_INSERTQ],
             (instrs EXTRQ, INSERTQ)>;
1053
// Scheduling entry for EXTRQI / INSERTQI. Same as the EXTRQ / INSERTQ entry
// except for the micro-op count (2 instead of 1).
def Zn4WriteEXTRQI_INSERTQI : SchedWriteRes<[Zn4FPVShuf01, Zn4FPLd01]> {
  let NumMicroOps = 2;
  let Latency = 3;
  let ReleaseAtCycles = [1, 1];
}
def : InstRW<[Zn4WriteEXTRQI_INSERTQI],
             (instrs EXTRQI, INSERTQI)>;
1060
// Vector integer ALU op, no logicals (XMM).
defm : Zn4WriteResXMMPair<WriteVecALUX, [Zn4FPVAdd0123], 1, [1], 1>;

// Dedicated entry for the register-register opcodes listed below, bound to
// Zn4FPVAdd01 with latency 2 and ReleaseAtCycles = [2].
def Zn4WriteVecALUXSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let NumMicroOps = 1;
  let Latency = 2;
  let ReleaseAtCycles = [2];
}
def : InstRW<[Zn4WriteVecALUXSlow],
             (instrs PABSBrr, PABSDrr, PABSWrr,
                     PADDSBrr, PADDSWrr, PADDUSBrr, PADDUSWrr,
                     PAVGBrr, PAVGWrr,
                     PSIGNBrr, PSIGNDrr, PSIGNWrr,
                     VPABSBrr, VPABSDrr, VPABSWrr,
                     VPADDSBrr, VPADDSWrr, VPADDUSBrr, VPADDUSWrr,
                     VPAVGBrr, VPAVGWrr,
                     VPCMPEQQrr,
                     VPSIGNBrr, VPSIGNDrr, VPSIGNWrr,
                     PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
                     VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr)>;
1078
// Scheduling entry for the opmask (K-register) opcodes listed below, bound to
// Zn4FPOpMask01.
def Zn4WriteVecOpMask : SchedWriteRes<[Zn4FPOpMask01]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVecOpMask],
             (instrs KADDBkk, KADDDkk, KADDQkk, KADDWkk,
                     KANDBkk, KANDDkk, KANDQkk, KANDWkk,
                     KANDNBkk, KANDNDkk, KANDNQkk, KANDNWkk,
                     KMOVBkk, KMOVDkk, KMOVQkk, KMOVWkk,
                     KMOVBrk, KMOVDrk, KMOVQrk, KMOVWrk,
                     KNOTBkk, KNOTDkk, KNOTQkk, KNOTWkk,
                     KORBkk, KORDkk, KORQkk, KORWkk,
                     KORTESTBkk, KORTESTDkk, KORTESTQkk, KORTESTWkk,
                     KTESTBkk, KTESTDkk, KTESTQkk, KTESTWkk,
                     KUNPCKBWkk, KUNPCKDQkk, KUNPCKWDkk,
                     KXNORBkk, KXNORDkk, KXNORQkk, KXNORWkk,
                     KXORBkk, KXORDkk, KXORQkk, KXORWkk)>;
1096
// Scheduling entry for the KMOV*mk opcodes, bound to Zn4FPOpMask4.
def Zn4WriteVecOpMaskMemMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVecOpMaskMemMov],
             (instrs KMOVBmk, KMOVDmk, KMOVQmk, KMOVWmk)>;
1103
// Scheduling entry for the KMOV*kr opcodes, bound to Zn4FPOpMask4.
def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVecOpMaskKRMov],
             (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
1110
// Scheduling entry for the VALIGND / VALIGNQ register-immediate opcodes
// (Z, Z128 and Z256 variants), bound to Zn4FPVAdd12.
def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
  let NumMicroOps = 1;
  // TODO: All align instructions are expected to be of 4 cycle latency
  let Latency = 4;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVecALU2Slow],
             (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
                     VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)>;
// Vector integer ALU op, no logicals (YMM).
defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>;

// Dedicated entry for the YMM register-register opcodes listed below, bound
// to Zn4FPVAdd01 (the generic WriteVecALUY entry names Zn4FPVAdd0123).
def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
  let NumMicroOps = 1;
  let Latency = 1;
  let ReleaseAtCycles = [1];
}
def : InstRW<[Zn4WriteVecALUYSlow],
             (instrs VPABSBYrr, VPABSDYrr, VPABSWYrr,
                     VPADDSBYrr, VPADDSWYrr, VPADDUSBYrr, VPADDUSWYrr,
                     VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
                     VPAVGBYrr, VPAVGWYrr,
                     VPCMPEQQYrr,
                     VPSIGNBYrr, VPSIGNDYrr, VPSIGNWYrr)>;
1133
// Vector integer ALU op, no logicals (ZMM).
defm : Zn4WriteResZMMPair<WriteVecALUZ, [Zn4FPVAdd0123], 1, [2], 1>;

// Vector integer and/or/xor logicals: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteVecLogic, [Zn4FPVMisc0123], 1, [1], 1>;
defm : Zn4WriteResXMMPair<WriteVecLogicX, [Zn4FPVMisc0123], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteVecLogicY, [Zn4FPVMisc0123], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteVecLogicZ, [Zn4FPVMisc0123], 1, [2], 1>;
// Vector integer TEST instructions: default, YMM, ZMM.
// FIXME: latency not from llvm-exegesis (applies to all three entries)
defm : Zn4WriteResXMMPair<WriteVecTest, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>;
defm : Zn4WriteResYMMPair<WriteVecTestY, [Zn4FPVAdd12, Zn4FPSt], 1, [1, 1], 2>;
defm : Zn4WriteResZMMPair<WriteVecTestZ, [Zn4FPVAdd12, Zn4FPSt], 1, [2, 2], 2>;
// Vector integer shifts: default, XMM, YMM, ZMM.
// NOTE(review): WriteVecShiftX uses latency 2 / ReleaseAtCycles [2] while the
// default and YMM entries use 1 / [1] — confirm this is intended.
defm : Zn4WriteResXMMPair<WriteVecShift, [Zn4FPVShift01], 1, [1], 1>;
defm : Zn4WriteResXMMPair<WriteVecShiftX, [Zn4FPVShift01], 2, [2], 1>;
defm : Zn4WriteResYMMPair<WriteVecShiftY, [Zn4FPVShift01], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteVecShiftZ, [Zn4FPVShift01], 1, [2], 1>;
// Vector integer immediate shifts: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteVecShiftImm, [Zn4FPVShift01], 1, [1], 1>;
defm : Zn4WriteResXMMPair<WriteVecShiftImmX, [Zn4FPVShift01], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteVecShiftImmY, [Zn4FPVShift01], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteVecShiftImmZ, [Zn4FPVShift01], 1, [2], 1>;
// Vector integer multiply: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteVecIMul, [Zn4FPVMul01], 3, [1], 1>;
defm : Zn4WriteResXMMPair<WriteVecIMulX, [Zn4FPVMul01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WriteVecIMulY, [Zn4FPVMul01], 3, [1], 1>;
defm : Zn4WriteResZMMPair<WriteVecIMulZ, [Zn4FPVMul01], 3, [2], 1>;
// Vector PMULLD: default, YMM, ZMM.
defm : Zn4WriteResXMMPair<WritePMULLD, [Zn4FPVMul01], 3, [1], 1>;
defm : Zn4WriteResYMMPair<WritePMULLDY, [Zn4FPVMul01], 3, [1], 1>;
defm : Zn4WriteResZMMPair<WritePMULLDZ, [Zn4FPVMul01], 3, [2], 1>;
// Vector shuffles: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteShuffle, [Zn4FPVShuf01], 1, [1], 1>;
defm : Zn4WriteResXMMPair<WriteShuffleX, [Zn4FPVShuf01], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteShuffleY, [Zn4FPVShuf01], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteShuffleZ, [Zn4FPVShuf01], 1, [2], 1>;
// Vector variable shuffles: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteVarShuffle, [Zn4FPVShuf01], 1, [1], 1>;
defm : Zn4WriteResXMMPair<WriteVarShuffleX, [Zn4FPVShuf01], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteVarShuffleY, [Zn4FPVShuf01], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteVarShuffleZ, [Zn4FPVShuf01], 1, [2], 1>;
// Vector blends: default, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteBlend, [Zn4FPVMisc0123], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteBlendY, [Zn4FPVMisc0123], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteBlendZ, [Zn4FPVMisc0123], 1, [2], 1>;
// Vector variable blends: default, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteVarBlend, [Zn4FPVMul01], 1, [1], 1>;
defm : Zn4WriteResYMMPair<WriteVarBlendY, [Zn4FPVMul01], 1, [1], 1>;
defm : Zn4WriteResZMMPair<WriteVarBlendZ, [Zn4FPVMul01], 1, [2], 1>;
// Vector PSADBW: default, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WritePSADBW, [Zn4FPVAdd0123], 3, [2], 1>;
defm : Zn4WriteResXMMPair<WritePSADBWX, [Zn4FPVAdd0123], 3, [2], 1>;
defm : Zn4WriteResYMMPair<WritePSADBWY, [Zn4FPVAdd0123], 3, [2], 1>;
defm : Zn4WriteResZMMPair<WritePSADBWZ, [Zn4FPVAdd0123], 4, [4], 1>;
// Vector MPSAD: default, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteMPSAD, [Zn4FPVAdd0123], 4, [8], 4, /*LoadUOps=*/2>;
defm : Zn4WriteResYMMPair<WriteMPSADY, [Zn4FPVAdd0123], 4, [8], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResZMMPair<WriteMPSADZ, [Zn4FPVAdd0123], 4, [16], 3, /*LoadUOps=*/1>;
// Vector PHMINPOS.
defm : Zn4WriteResXMMPair<WritePHMINPOS, [Zn4FPVAdd01], 3, [1], 1>;
1180
// Vector insert/extract operations.
// Insert gpr to vector element.
defm : Zn4WriteResXMMPair<WriteVecInsert, [Zn4FPLd01], 1, [2], 2, /*LoadUOps=*/-1>;
// Extract vector element to gpr.
defm : Zn4WriteResXMM<WriteVecExtract, [Zn4FPLd01], 1, [2], 2>;
// Extract vector element and store; latency is 1 plus StoreLatency.
defm : Zn4WriteResXMM<WriteVecExtractSt, [Zn4FPSt, Zn4Store], !add(1, Znver4Model.StoreLatency), [1, 1], 2>;
1185
// MOVMSK operations. All four classes share the same parameters and are
// bound to Zn4FPVMisc2.
defm : Zn4WriteResXMM<WriteFMOVMSK,    [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResXMM<WriteVecMOVMSK,  [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResYMM<WriteVecMOVMSKY, [Zn4FPVMisc2], 1, [1], 1>;
defm : Zn4WriteResXMM<WriteMMXMOVMSK,  [Zn4FPVMisc2], 1, [1], 1>;
1191
// Conversion between integer and float.
// Double -> Integer: scalar, XMM, YMM, ZMM.
defm : Zn4WriteResXMMPair<WriteCvtSD2I, [Zn4FPFCvt01], 1, [1], 1>;
defm : Zn4WriteResXMMPair<WriteCvtPD2I, [Zn4FPFCvt01], 3, [2], 1>;
defm : Zn4WriteResYMMPair<WriteCvtPD2IY, [Zn4FPFCvt01], 3, [2], 2>;
defm : Zn4WriteResZMMPair<WriteCvtPD2IZ, [Zn4FPFCvt01], 3, [4], 2>;
1197
// Standalone write for what the def name suggests is the MMX flavour of the
// Double -> Integer conversion: Latency 1, ReleaseAtCycles [2] on
// Zn4FPFCvt01, 2 micro-ops.
// NOTE(review): no InstRW in this part of the file references this def -
// confirm where it is bound.
def Zn4WriteCvtPD2IMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 2;
}
1203defm : Zn4WriteResXMMPair<WriteCvtSS2I, [Zn4FPFCvt01], 5, [5], 2>;  // Float -> Integer.
1204
1205defm : Zn4WriteResXMMPair<WriteCvtPS2I, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Integer (XMM).
1206defm : Zn4WriteResYMMPair<WriteCvtPS2IY, [Zn4FPFCvt01], 4, [1], 1>; // Float -> Integer (YMM).
1207defm : Zn4WriteResZMMPair<WriteCvtPS2IZ, [Zn4FPFCvt01], 4, [2], 2>; // Float -> Integer (ZMM).
1208
1209defm : Zn4WriteResXMMPair<WriteCvtI2SD, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Double.
1210defm : Zn4WriteResXMMPair<WriteCvtI2PD, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Double (XMM).
1211defm : Zn4WriteResYMMPair<WriteCvtI2PDY, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Double (YMM).
1212defm : Zn4WriteResZMMPair<WriteCvtI2PDZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Integer -> Double (ZMM).
1213
// Standalone write for what the def name suggests is the MMX flavour of the
// Integer -> Double conversion: Latency 2, ReleaseAtCycles [6] on
// Zn4FPFCvt01, 2 micro-ops.
// NOTE(review): no InstRW in this part of the file references this def -
// confirm where it is bound.
def Zn4WriteCvtI2PDMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 2;
  let ReleaseAtCycles = [6];
  let NumMicroOps = 2;
}
1219
1220defm : Zn4WriteResXMMPair<WriteCvtI2SS, [Zn4FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>;  // Integer -> Float.
1221defm : Zn4WriteResXMMPair<WriteCvtI2PS, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM).
1222defm : Zn4WriteResYMMPair<WriteCvtI2PSY, [Zn4FPFCvt01], 3, [1], 1>; // Integer -> Float (YMM).
1223defm : Zn4WriteResZMMPair<WriteCvtI2PSZ, [Zn4FPFCvt01], 3, [2], 2>; // Integer -> Float (ZMM).
1224
// Standalone write for what the def name suggests is the MMX flavour of the
// Integer -> Float conversion: Latency 3, ReleaseAtCycles [1] on
// Zn4FPFCvt01, 2 micro-ops.
// NOTE(review): no InstRW in this part of the file references this def -
// confirm where it is bound.
def Zn4WriteCvtI2PSMMX : SchedWriteRes<[Zn4FPFCvt01]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
1230
1231defm : Zn4WriteResXMMPair<WriteCvtSS2SD, [Zn4FPFCvt01], 3, [1], 1>;  // Float -> Double size conversion.
1232defm : Zn4WriteResXMMPair<WriteCvtPS2PD, [Zn4FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM).
1233defm : Zn4WriteResYMMPair<WriteCvtPS2PDY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Float -> Double size conversion (YMM).
1234defm : Zn4WriteResZMMPair<WriteCvtPS2PDZ, [Zn4FPFCvt01], 6, [4], 4, /*LoadUOps=*/-1>; // Float -> Double size conversion (ZMM).
1235
1236defm : Zn4WriteResXMMPair<WriteCvtSD2SS, [Zn4FPFCvt01], 3, [1], 1>;  // Double -> Float size conversion.
1237defm : Zn4WriteResXMMPair<WriteCvtPD2PS, [Zn4FPFCvt01], 3, [1], 1>; // Double -> Float size conversion (XMM).
1238defm : Zn4WriteResYMMPair<WriteCvtPD2PSY, [Zn4FPFCvt01], 6, [2], 2>; // Double -> Float size conversion (YMM).
1239defm : Zn4WriteResZMMPair<WriteCvtPD2PSZ, [Zn4FPFCvt01], 6, [4], 4>; // Double -> Float size conversion (ZMM).
1240
1241defm : Zn4WriteResXMMPair<WriteCvtPH2PS, [Zn4FPFCvt01], 3, [1], 1>; // Half -> Float size conversion.
1242defm : Zn4WriteResYMMPair<WriteCvtPH2PSY, [Zn4FPFCvt01], 4, [2], 2, /*LoadUOps=*/-1>; // Half -> Float size conversion (YMM).
1243defm : Zn4WriteResZMMPair<WriteCvtPH2PSZ, [Zn4FPFCvt01], 4, [4], 4, /*LoadUOps=*/-1>; // Half -> Float size conversion (ZMM).
1244
1245defm : Zn4WriteResXMM<WriteCvtPS2PH, [Zn4FPFCvt01], 3, [2], 1>; // Float -> Half size conversion.
1246defm : Zn4WriteResYMM<WriteCvtPS2PHY, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (YMM).
1247defm : Zn4WriteResZMM<WriteCvtPS2PHZ, [Zn4FPFCvt01], 6, [2], 2>; // Float -> Half size conversion (ZMM).
1248
// Float -> Half conversion fused with a store. The latency is the conversion
// latency plus the model's StoreLatency, and Zn4FPSt and Zn4Store are each
// charged one cycle in addition to the conversion pipe.
defm : Zn4WriteResXMM<WriteCvtPS2PHSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(3, Znver4Model.StoreLatency), [1, 1, 1], 2>; // Float -> Half + store size conversion.
defm : Zn4WriteResYMM<WriteCvtPS2PHYSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (YMM).
// The ZMM write is instantiated through the ZMM helper, like WriteCvtPS2PHZ.
defm : Zn4WriteResZMM<WriteCvtPS2PHZSt, [Zn4FPFCvt01, Zn4FPSt, Zn4Store], !add(6, Znver4Model.StoreLatency), [2, 1, 1], 3>; // Float -> Half + store size conversion (ZMM).
1252
1253// CRC32 instruction.
1254defm : Zn4WriteResIntPair<WriteCRC32, [Zn4ALU1], 3, [1], 1>;
1255
// SHA1MSG1, register-register form: two micro-ops on Zn4FPU0123.
def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [2];
  let Latency = 2;
}
def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;

// SHA1MSG1, register-memory form: derived from the register form. Latency is
// extended by the model's LoadLatency, Zn4AGU012 and Zn4Load are charged one
// cycle each, and the micro-op count is carried over with a delta of 0.
def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
  let ReleaseAtCycles = [1, 1, 2];
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
}
def : InstRW<[Zn4WriteSHA1MSG1rm], (instrs SHA1MSG1rm)>;
1269
// Shared write for the register-register forms of SHA1MSG2 and SHA1NEXTE:
// one micro-op on Zn4FPU0123, Latency 1, ReleaseAtCycles [2].
def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 1;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;

// Register-memory counterpart of the write above: latency is extended by the
// model's LoadLatency, Zn4AGU012 and Zn4Load are charged one cycle each, and
// the micro-op count is carried over with a delta of 0.
def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
def : InstRW<[Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm], (instrs SHA1MSG2rm, SHA1NEXTErm)>;
1283
// SHA256MSG1, register-register form: two micro-ops on Zn4FPU0123.
def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [3];
  let Latency = 2;
}
def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;

// SHA256MSG1, register-memory form: derived from the register form. Latency
// is extended by the model's LoadLatency, Zn4AGU012 and Zn4Load are charged
// one cycle each, and the micro-op count is carried over with a delta of 0.
def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
  let ReleaseAtCycles = [1, 1, 3];
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
}
def : InstRW<[Zn4Writerm_SHA256MSG1rm], (instrs SHA256MSG1rm)>;
1297
// SHA256MSG2, register-register form: four micro-ops on Zn4FPU0123,
// Latency 3, ReleaseAtCycles [8].
def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 3;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 4;
}
def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;

// SHA256MSG2, register-memory form: latency is extended by the model's
// LoadLatency and, unlike the other SHA memory forms in this group, one
// extra micro-op is added on top of the register form's count.
def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
  let ReleaseAtCycles = [1, 1, 8];
  let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteSHA256MSG2rm], (instrs SHA256MSG2rm)>;
1311
// SHA1RNDS4, register form: a single micro-op on Zn4FPU0123 with Latency 6
// and ReleaseAtCycles [8]. Only the rri form is bound here.
def Zn4WriteSHA1RNDS4rri : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 6;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA1RNDS4rri], (instrs SHA1RNDS4rri)>;

// SHA256RNDS2, register form: a single micro-op on Zn4FPU0123 with Latency 4
// and ReleaseAtCycles [8]. Only the rr form is bound here.
def Zn4WriteSHA256RNDS2rr : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 4;
  let ReleaseAtCycles = [8];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
1325
1326// Strings instructions.
1327// Packed Compare Implicit Length Strings, Return Mask
1328defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
1329// Packed Compare Explicit Length Strings, Return Mask
1330defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
1331// Packed Compare Implicit Length Strings, Return Index
1332defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
1333// Packed Compare Explicit Length Strings, Return Index
1334defm : Zn4WriteResXMMPair<WritePCmpEStrI, [Zn4FPVAdd0123], 6, [12], 8, /*LoadUOps=*/4>;
1335
1336// AES instructions.
1337defm : Zn4WriteResXMMPair<WriteAESDecEnc, [Zn4FPAES01], 4, [1], 1>; // Decryption, encryption.
1338defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn.
1339defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
1340
1341// Carry-less multiplication instructions.
1342defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
1343
1344// EMMS/FEMMS
1345defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
1346
1347// Load/store MXCSR
1348defm : Zn4WriteResInt<WriteLDMXCSR, [Zn4AGU012, Zn4Load, Zn4ALU0123], !add(Znver4Model.LoadLatency, 1), [1, 1, 6], 1>; // FIXME: latency not from llvm-exegesis
1349defm : Zn4WriteResInt<WriteSTMXCSR, [Zn4ALU0123, Zn4AGU012, Zn4Store], !add(1, Znver4Model.StoreLatency), [60, 1, 1], 2>; // FIXME: latency not from llvm-exegesis
1350
1351// Catch-all for expensive system instructions.
1352defm : Zn4WriteResInt<WriteSystem, [Zn4ALU0123], 100, [100], 100>;
1353
// VZEROUPPER: a single micro-op on Zn4FPU0123 modelled with zero latency.
// The latency value is flagged as not coming from llvm-exegesis.
def Zn4WriteVZEROUPPER : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 0; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVZEROUPPER], (instrs VZEROUPPER)>;

// VZEROALL: 18 micro-ops on Zn4FPU0123 with Latency 10 and
// ReleaseAtCycles [24]. The latency value is flagged as not coming from
// llvm-exegesis.
def Zn4WriteVZEROALL : SchedWriteRes<[Zn4FPU0123]> {
  let Latency = 10; // FIXME: not from llvm-exegesis
  let ReleaseAtCycles = [24];
  let NumMicroOps = 18;
}
def : InstRW<[Zn4WriteVZEROALL], (instrs VZEROALL)>;
1367
1368// AVX2.
1369defm : Zn4WriteResYMMPair<WriteFShuffle256, [Zn4FPVShuf], 2, [1], 1, /*LoadUOps=*/2>; // Fp 256-bit width vector shuffles.
1370defm : Zn4WriteResYMMPair<WriteFVarShuffle256, [Zn4FPVShuf], 7, [1], 2, /*LoadUOps=*/1>; // Fp 256-bit width variable shuffles.
1371defm : Zn4WriteResYMMPair<WriteShuffle256, [Zn4FPVShuf], 1, [1], 1>; // 256-bit width vector shuffles.
1372
// Shared write for the register forms of VPERM2I128 and VPERM2F128: one
// micro-op on Zn4FPVShuf with Latency 3.
def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>;

// Memory form: latency is extended by the model's LoadLatency and the
// micro-op count is carried over with a delta of 0.
// NOTE(review): only VPERM2F128rmi is bound here; VPERM2I128rmi has no
// matching InstRW in this part of the file - confirm that is intentional.
def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
  let ReleaseAtCycles = [1, 1, 1];
  let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
1386
// VPERMPS (YMM), register form: two micro-ops on Zn4FPVShuf with Latency 7.
def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 7;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;

// VPERMPS (YMM), memory form: latency is extended by the model's
// LoadLatency, one micro-op is added to the register form's count, and
// Zn4FPVShuf is charged [2] here versus [1] for the register form.
def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
1400
// Shared write for the immediate register forms VPERMPDYri and VPERMQYri:
// two micro-ops on Zn4FPVShuf with Latency 6.
def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 6;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;

// VPERMPD (YMM), memory form: latency is extended by the model's LoadLatency
// and one micro-op is added to the register form's count.
def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
1414
// VPERMD (YMM), register form: two micro-ops on Zn4FPVShuf with Latency 5.
def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
  let Latency = 5;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;

// Shared memory-form write for VPERMQYmi and VPERMDYrm, derived from the
// VPERMDYrr write: latency is extended by the model's LoadLatency and the
// micro-op count is carried over with a delta of 0.
// NOTE(review): VPERMQYmi is derived from the VPERMDYrr latency (5), whereas
// its register form VPERMQYri uses Zn4WriteVPERMYri (latency 6) - confirm.
def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
  let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
  let ReleaseAtCycles = [1, 1, 2];
  let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
}
def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
1428
1429defm : Zn4WriteResYMMPair<WriteVPMOV256, [Zn4FPVShuf01], 4, [3], 2, /*LoadUOps=*/-1>; // 256-bit width packed vector width-changing move.
1430defm : Zn4WriteResYMMPair<WriteVarShuffle256, [Zn4FPVShuf01], 1, [1], 2>; // 256-bit width vector variable shuffles.
1431defm : Zn4WriteResXMMPair<WriteVarVecShift, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts.
1432defm : Zn4WriteResYMMPair<WriteVarVecShiftY, [Zn4FPVShift01], 1, [1], 1>; // Variable vector shifts (YMM).
1433defm : Zn4WriteResZMMPair<WriteVarVecShiftZ, [Zn4FPVShift01], 1, [2], 2>; // Variable vector shifts (ZMM).
1434
// Old microcoded instructions that nobody uses.
1436defm : Zn4WriteResInt<WriteMicrocoded, [Zn4ALU0123], 100, [100], 100>;
1437
1438// Fence instructions.
1439defm : Zn4WriteResInt<WriteFence, [Zn4ALU0123], 1, [100], 1>;
1440
// LFENCE gets its own write instead of the generic WriteFence entry: it is
// modelled on Zn4LSU with Latency 1 and ReleaseAtCycles [30].
def Zn4WriteLFENCE : SchedWriteRes<[Zn4LSU]> {
  let Latency = 1;
  let ReleaseAtCycles = [30];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteLFENCE], (instrs LFENCE)>;

// SFENCE likewise gets its own write on Zn4LSU, with Latency 1 and
// ReleaseAtCycles [1].
def Zn4WriteSFENCE : SchedWriteRes<[Zn4LSU]> {
  let Latency = 1;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteSFENCE], (instrs SFENCE)>;
1454
// Nop, not very useful except it provides a model for nops!
1456defm : Zn4WriteResInt<WriteNop, [Zn4ALU0123], 0, [1], 1>; // FIXME: latency not from llvm-exegesis
1457
1458
1459///////////////////////////////////////////////////////////////////////////////
1460// Zero Cycle Move
1461///////////////////////////////////////////////////////////////////////////////
1462
// A write that costs nothing in the model: no processor resources, zero
// latency, one micro-op. It is bound to the GPR register moves listed below.
def Zn4WriteZeroLatency : SchedWriteRes<[]> {
  let NumMicroOps = 1;
  let ReleaseAtCycles = [];
  let Latency = 0;
}
def : InstRW<[Zn4WriteZeroLatency], (instrs
  MOV32rr, MOV32rr_REV,
  MOV64rr, MOV64rr_REV,
  MOVSX32rr32
)>;
1471
// Register-register XCHG (32- and 64-bit forms): like a zero-latency move it
// uses no processor resources, but it is counted as two micro-ops.
def Zn4WriteSwapRenameable : SchedWriteRes<[]> {
  let NumMicroOps = 2;
  let ReleaseAtCycles = [];
  let Latency = 0;
}
def : InstRW<[Zn4WriteSwapRenameable], (instrs
  XCHG32rr, XCHG32ar,
  XCHG64rr, XCHG64ar
)>;
1479
1480defm : Zn4WriteResInt<WriteXCHG, [Zn4ALU0123], 0, [8], 2>;        // Compare+Exchange - TODO RMW support.
1481
// FP vector register moves are modelled as free: no resources, zero latency,
// one micro-op. Each width is instantiated through its own helper.
defm : Zn4WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteFMoveY, [], 0, [], 1>;
defm : Zn4WriteResZMM<WriteFMoveZ, [], 0, [], 1>;

// Integer vector register moves: the MMX move occupies Zn4FPFMisc0123 for one
// cycle, while the XMM/YMM/ZMM moves are free like the FP moves.
defm : Zn4WriteResXMM<WriteVecMove, [Zn4FPFMisc0123], 1, [1], 1>; // MMX
defm : Zn4WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn4WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
defm : Zn4WriteResZMM<WriteVecMoveZ, [], 0, [], 1>;
1490
// Instructions declared as optimizable register moves for this model. They
// all belong to a single equivalence class guarded by TruePred, i.e. the
// listing is unconditional. MMX moves are deliberately left out, and no
// EVEX-encoded (Z/Z128/Z256) moves are listed.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV32rr_REV,
    MOV64rr, MOV64rr_REV,
    MOVSX32rr32,
    XCHG32rr, XCHG32ar,
    XCHG64rr, XCHG64ar,

    // MMX variants.
    // MMX moves are *NOT* eliminated.

    // SSE variants.
    MOVAPSrr, MOVAPSrr_REV,
    MOVUPSrr, MOVUPSrr_REV,
    MOVAPDrr, MOVAPDrr_REV,
    MOVUPDrr, MOVUPDrr_REV,
    MOVDQArr, MOVDQArr_REV,
    MOVDQUrr, MOVDQUrr_REV,

    // AVX variants.
    VMOVAPSrr, VMOVAPSrr_REV,
    VMOVUPSrr, VMOVUPSrr_REV,
    VMOVAPDrr, VMOVAPDrr_REV,
    VMOVUPDrr, VMOVUPDrr_REV,
    VMOVDQArr, VMOVDQArr_REV,
    VMOVDQUrr, VMOVDQUrr_REV,

    // AVX YMM variants.
    VMOVAPSYrr, VMOVAPSYrr_REV,
    VMOVUPSYrr, VMOVUPSYrr_REV,
    VMOVAPDYrr, VMOVAPDYrr_REV,
    VMOVUPDYrr, VMOVUPDYrr_REV,
    VMOVDQAYrr, VMOVDQAYrr_REV,
    VMOVDQUYrr, VMOVDQUYrr_REV,
  ], TruePred >
]>;
1528
1529// FIXUP and RANGE Instructions
1530def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
1531  let Latency = 2;
1532  let ReleaseAtCycles = [2];
1533  let NumMicroOps = 1;
1534}
1535def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
1536	"VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
1537        "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri",  "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
1538	"VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
1539	)>;
1540
1541// SCALE & REDUCE instructions
1542def Zn4WriteSCALErr: SchedWriteRes<[Zn4FPFMisc23]> {
1543  let Latency = 6;
1544  let ReleaseAtCycles = [6];
1545  let NumMicroOps = 2;
1546}
1547def : InstRW<[Zn4WriteSCALErr], (instregex
1548        "V(SCALEF|REDUCE)(S|P)(S|D)(Z?|Z128?|Z256?)(rr|rrb|rrkz|rrik|rrikz|rri)(_Int?)",
1549        "(V?)REDUCE(PD|PS|SD|SS)(Z?|Z128?)(rri|rrikz|rrib)"
1550	)>;
1551
1552//BF16PS Instructions
1553def Zn4WriteBF16: SchedWriteRes<[Zn4FPFMisc23]> {
1554  let Latency = 6;
1555  let ReleaseAtCycles = [6];
1556  let NumMicroOps = 2;
1557}
1558def : InstRW<[Zn4WriteBF16], (instregex
1559        "(V?)DPBF16PS(Z?|Z128?|Z256?)(r|rk|rkz)"
1560	)>;
1561
1562// BUSD and VPMADD Instructions
1563def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
1564  let Latency = 4;
1565  let ReleaseAtCycles = [4];
1566  let NumMicroOps = 1;
1567}
1568def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
1569	"VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
1570        "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
1571	)>;
1572
1573// SHIFT instructions
1574def Zn4WriteSHIFTrr: SchedWriteRes<[Zn4FPFMisc01]> {
1575  let Latency = 2;
1576  let ReleaseAtCycles = [2];
1577  let NumMicroOps = 1;
1578}
1579def : InstRW<[Zn4WriteSHIFTrr], (instregex
1580        "VP(LZCNT|SHLD|SHRD?)(D|Q|W|VD|VQ|VW?)(Z?|Z128?|Z256?)(rr|rk|rrk|rrkz|rri|rrik|rrikz)",
1581        "(V?)P(SLL|SRL|SRA)(D|Q|W|DQ)(Y?|Z?|Z128?|Z256?)(rr|rrk|rrkz)",
1582        "(V?)P(SLL|SRL|SRA)DQYri",
1583        "(V?)P(SLL|SRL)DQ(Z?|Z256?)ri",
1584        "(V?)P(SHUFB)(Y|Z|Z128|Z256?)(rr|rrk|rrkz)",
1585        "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
1586        "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
1587        "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
1588	"VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
1589	)>;
1590
1591def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
1592  let Latency = 1;
1593  let ReleaseAtCycles = [1];
1594  let NumMicroOps = 1;
1595}
1596def : InstRW<[Zn4WriteSHIFTri], (instregex
1597        "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
1598	)>;
1599
1600// ALIGN Instructions
1601def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
1602  let Latency = 2;
1603  let ReleaseAtCycles = [2];
1604  let NumMicroOps = 1;
1605}
1606def : InstRW<[Zn4WriteALIGN], (instregex
1607        "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
1608	)>;
1609
1610//PACK Instructions
1611def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
1612  let Latency = 2;
1613  let ReleaseAtCycles = [2];
1614  let NumMicroOps = 1;
1615}
1616def : InstRW<[Zn4WritePACK], (instregex
1617        "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
1618	)>;
1619
1620// MAX and MIN Instructions
1621def Zn4WriteFCmp64: SchedWriteRes<[Zn4FPFMisc01]> {
1622  let Latency = 2;
1623  let ReleaseAtCycles = [2];
1624  let NumMicroOps = 1;
1625}
1626def : InstRW<[Zn4WriteFCmp64], (instregex
1627        "(V?)CMP(S|P)(S|D)(rr|rri|rr_Int)",
1628        "(V?|VP?)(MAX|MIN|MINC|MAXC)(S|P|U)(S|D|Q)(Z?|Z128?|Z256?)(rr|rri|rrk|rrkz)(_Int?)",
1629        "VP(MAX|MIN)(SQ|UQ)(Z|Z128|Z256)(rr|rrk|rrkz)",
1630        "(V?)(MAX|MAXC|MIN|MINC)PD(Z|Z128|Z256?)(rr|rrk|rrkz)"
1631	)>;
1632
1633// MOV Instructions
// EVEX-encoded VMOVDDUP register forms (Z, Z128, Z256, with optional
// masking): one micro-op on Zn4FPFMisc12, Latency 2, ReleaseAtCycles [2].
def Zn4MOVDUPZ: SchedWriteRes<[Zn4FPFMisc12]> {
  let Latency = 2;
  let ReleaseAtCycles = [2];
  let NumMicroOps = 1;
}
// The pattern starts with the literal mnemonic: an optional leading "V" in
// front of "VMOVDDUP" would only add a match for a non-existent "VVMOVDDUP".
def : InstRW<[Zn4MOVDUPZ], (instregex
        "VMOVDDUP(Z|Z128|Z256)(rr|rrk|rrkz)"
	)>;
1642
1643def Zn4MOVS: SchedWriteRes<[Zn4FPFMisc12]> {
1644  let Latency = 2;
1645  let ReleaseAtCycles = [1];
1646  let NumMicroOps = 1;
1647}
1648def : InstRW<[Zn4MOVS], (instregex
1649        "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)(Y?|Z128?|Z256?)(rr|rrk|rrkz)",
1650        "(V?)PMOV(S?|US?)(DB|DW|QB|QD|QW|WB)(Z128|Z256)(rr|rrk|rrkz)"
1651	)>;
1652
1653def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> {
1654  let Latency = 4;
1655  let ReleaseAtCycles = [2];
1656  let NumMicroOps = 1;
1657}
1658def : InstRW<[Zn4MOVSZ], (instregex
1659        "(V?)PMOV(SX|ZX)(BD|BQ|BW|WD|WQ|DQ)Z(rr|rrk|rrkz)"
1660	)>;
1661
1662def Zn4MOVSrr: SchedWriteRes<[Zn4FPFMisc12]> {
1663  let Latency = 5;
1664  let ReleaseAtCycles = [2];
1665  let NumMicroOps = 1;
1666}
1667def : InstRW<[Zn4MOVSrr], (instregex
1668        "(V?)PMOV(S?|US?)(DB|DW|QB|QD|QW|WB)Z(rr|rrk|rrkz)"
1669	)>;
1670
1671
1672//VPTEST Instructions
1673def Zn4VPTESTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
1674  let Latency = 3;
1675  let ReleaseAtCycles = [3];
1676  let NumMicroOps = 1;
1677}
1678def : InstRW<[Zn4VPTESTZ128], (instregex
1679        "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z128?)(rrk)"
1680	)>;
1681
1682def Zn4VPTESTZ256: SchedWriteRes<[Zn4FPFMisc01]> {
1683  let Latency = 4;
1684  let ReleaseAtCycles = [4];
1685  let NumMicroOps = 1;
1686}
1687def : InstRW<[Zn4VPTESTZ256], (instregex
1688        "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z256?)(rr|rrk)"
1689	)>;
1690
1691def Zn4VPTESTZ: SchedWriteRes<[Zn4FPFMisc01]> {
1692  let Latency = 5;
1693  let ReleaseAtCycles = [5];
1694  let NumMicroOps = 1;
1695}
1696def : InstRW<[Zn4VPTESTZ], (instregex
1697        "(V?)PTEST(N?)(MB|MD|MQ|MW)(Z?)(rrk)"
1698	)>;
1699
1700// CONFLICT Instructions
1701def Zn4CONFLICTZ128: SchedWriteRes<[Zn4FPFMisc01]> {
1702  let Latency = 2;
1703  let ReleaseAtCycles = [2];
1704  let NumMicroOps = 1;
1705}
1706def : InstRW<[Zn4CONFLICTZ128], (instregex
1707        "VPCONFLICT(D|Q)(Z128)(rr|rrk|rrkz)"
1708	)>;
1709
1710def Zn4CONFLICTrr: SchedWriteRes<[Zn4FPFMisc01,Zn4FPFMisc12,Zn4FPFMisc23]> {
1711  let Latency = 6;
1712  let ReleaseAtCycles = [2,2,2];
1713  let NumMicroOps = 4;
1714}
1715def : InstRW<[Zn4CONFLICTrr], (instregex
1716        "VPCONFLICT(D|Q)(Z|Z256)(rr|rrkz)"
1717	)>;
1718
1719// RSQRT Instructions
1720def Zn4VRSQRT14PDZ256: SchedWriteRes<[Zn4FPFMisc01]> {
1721  let Latency = 5;
1722  let ReleaseAtCycles = [2];
1723  let NumMicroOps = 1;
1724}
1725def : InstRW<[Zn4VRSQRT14PDZ256], (instregex
1726        "VRSQRT14(PD|PS)(Z?|Z128?|Z256?)(r|rr|rk|rrk|rkz|rrkz)"
1727	)>;
1728
1729
1730// PERM Instructions
1731def Zn4PERMILP: SchedWriteRes<[Zn4FPFMisc123]> {
1732  let Latency = 2;
1733  let ReleaseAtCycles = [2];
1734  let NumMicroOps = 1;
1735}
1736def : InstRW<[Zn4PERMILP], (instregex
1737        "VPERMILP(S|D)(Y|Z|Z128|Z256)(rr|rrk|rrkz)"
1738	)>;
1739
1740def Zn4PERMIT2_128: SchedWriteRes<[Zn4FPFMisc12]> {
1741  let Latency = 3;
1742  let ReleaseAtCycles = [2];
1743  let NumMicroOps = 1;
1744}
1745def : InstRW<[Zn4PERMIT2_128], (instregex
1746	"VPERM(I2|T2)(PS|PD|W)Z128(rr|rrk|rrkz)",
1747	"VPERM(I2|T2)(B|D|Q)Z128(rr|rrk|rrkz)"
1748	)>;
1749
1750def Zn4PERMIT2_128rr:SchedWriteRes<[Zn4FPFMisc12]> {
1751  let Latency = 2;
1752  let ReleaseAtCycles = [2];
1753  let NumMicroOps = 1;
1754}
1755def : InstRW<[Zn4PERMIT2_128rr], (instregex
1756	"V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z128(rr|rrk|rrkz)",
1757	"VPERM(B|D|Q|W)(Z128?)(rr|rrk|rrkz)"
1758	)>;
1759
1760def Zn4PERMIT2_256: SchedWriteRes<[Zn4FPFMisc12]> {
1761  let Latency = 4;
1762  let ReleaseAtCycles = [2];
1763  let NumMicroOps = 1;
1764}
1765def : InstRW<[Zn4PERMIT2_256], (instregex
1766	"VPERM(I2|T2)(PS|PD|W)Z256(rr|rrk|rrkz)",
1767	"VPERMP(S|D)Z256(rr|rrk|rrkz)",
1768	"V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z256(rr|rrk|rrkz)",
1769	"VPERM(B|D|Q|W)Z256(rr|rrk|rrkz)",
1770	"VPERM(I2|Q|T2)(B|D|Q)Z256(rr|rrk|rrkz)",
1771	"VPEXPAND(B|W)Z256(rr|rrk|rrkz)"
1772	)>;
1773
1774def Zn4PERMIT2Z: SchedWriteRes<[Zn4FPFMisc12]> {
1775  let Latency = 5;
1776  let ReleaseAtCycles = [2];
1777  let NumMicroOps = 1;
1778}
1779def : InstRW<[Zn4PERMIT2Z], (instregex
1780	"VPERM(I2|T2)(PS|PD|W)Z(rr|rrk|rrkz)",
1781	"VPERM(B|D|W)Z(rr|rrk|rrkz)",
1782	"VPERM(I2|Q|T2)(B|D|Q)Z(rr|rrk|rrkz)",
1783	"V(P?)COMPRESS(B|W|D|Q|PD|PS|SD|SQ)Z(rr|rrk|rrkz)",
1784	"VPEXPAND(B|W)Z(rr|rrk|rrkz)",
1785	"VPERMP(S|D)Z(rr|rrk|rrkz)"
1786	)>;
1787
1788// ALU SLOW Misc Instructions
1789def Zn4VecALUZSlow: SchedWriteRes<[Zn4FPFMisc01]> {
1790  let Latency = 2;
1791  let ReleaseAtCycles = [2];
1792  let NumMicroOps = 1;
1793}
1794def : InstRW<[Zn4VecALUZSlow], (instrs
1795	VPABSBZ128rr,      VPABSBZ128rrk,  VPABSBZ128rrkz,   VPABSDZ128rr,
1796	VPABSDZ128rrk,     VPABSDZ128rrkz, VPABSQZ128rr,     VPABSQZ128rrk,
1797	VPABSQZ128rrkz,    VPABSWZ128rr,   VPABSWZ128rrk,    VPABSWZ128rrkz,
1798	VPADDSBZ128rr,     VPADDSBZ128rrk, VPADDSBZ128rrkz,  VPADDSWZ128rr,
1799	VPADDSWZ128rrk,    VPADDSWZ128rrkz,VPADDUSBZ128rr,   VPADDUSBZ128rrk,
1800	VPADDUSBZ128rrkz,  VPADDUSWZ128rr, VPADDUSWZ128rrk,  VPADDUSWZ128rrkz,
1801	VPAVGBZ128rr,      VPAVGBZ128rrk,  VPAVGBZ128rrkz,   VPAVGWZ128rr,
1802	VPAVGWZ128rrk,     VPAVGWZ128rrkz, VPOPCNTBZ128rr,   VPOPCNTBZ128rrk,
1803	VPOPCNTBZ128rrkz,  VPOPCNTDZ128rr, VPOPCNTDZ128rrk,  VPOPCNTDZ128rrkz,
1804	VPOPCNTQZ128rr,    VPOPCNTQZ128rrk,VPOPCNTQZ128rrkz, VPOPCNTWZ128rr,
1805	VPOPCNTWZ128rrk,   VPOPCNTWZ128rrkz,VPSUBSBZ128rr,   VPSUBSBZ128rrk,
1806	VPSUBSBZ128rrkz,   VPSUBSWZ128rr,   VPSUBSWZ128rrk,  VPSUBSWZ128rrkz,
1807	VPSUBUSBZ128rr,    VPSUBUSBZ128rrk, VPSUBUSBZ128rrkz,VPSUBUSWZ128rr,
1808	VPSUBUSWZ128rrk,   VPSUBUSWZ128rrkz
1809	)>;
1810
1811
1812///////////////////////////////////////////////////////////////////////////////
1813// Dependency breaking instructions.
1814///////////////////////////////////////////////////////////////////////////////
1815
// GPR XOR/SUB register forms: scheduled as Zn4WriteZeroLatency when
// ZeroIdiomPredicate holds, and as a plain WriteALU otherwise.
def Zn4WriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn4WriteZeroIdiom], (instrs
  XOR32rr, XOR32rr_REV,
  XOR64rr, XOR64rr_REV,
  SUB32rr, SUB32rr_REV,
  SUB64rr, SUB64rr_REV
)>;
1824
// CMP register forms: scheduled as Zn4WriteZeroLatency when
// CheckSameRegOperand<0, 1> holds, and as a plain WriteALU otherwise.
def Zn4WriteZeroIdiomEFLAGS : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<CheckSameRegOperand<0, 1>>, [Zn4WriteZeroLatency]>,
  SchedVar<NoSchedPred, [WriteALU]>
]>;
def : InstRW<[Zn4WriteZeroIdiomEFLAGS], (instrs
  CMP8rr,  CMP8rr_REV,
  CMP16rr, CMP16rr_REV,
  CMP32rr, CMP32rr_REV,
  CMP64rr, CMP64rr_REV
)>;
1833
// 128-bit FP logic zero idioms: scheduled as Zn4WriteZeroLatency when
// ZeroIdiomPredicate holds, and as WriteFLogic otherwise. Only VEX and
// EVEX (Z128) encoded XOR/ANDN forms are bound.
def Zn4WriteFZeroIdiom : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
// NOTE: XORPSrr, XORPDrr are not zero-cycle!
def : InstRW<[Zn4WriteFZeroIdiom], (instrs VXORPSrr, VXORPDrr,
                                           VXORPSZ128rr,
                                           VXORPDZ128rr,
                                           VANDNPSrr, VANDNPDrr,
                                           VANDNPSZ128rr,
                                           VANDNPDZ128rr)>;
1845
// 256-bit FP logic zero idioms: scheduled as Zn4WriteZeroLatency when
// ZeroIdiomPredicate holds, and as WriteFLogicY otherwise.
def Zn4WriteFZeroIdiomY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[Zn4WriteFZeroIdiomY], (instrs VXORPSYrr, VXORPDYrr,
                                            VXORPSZ256rr,
                                            VXORPDZ256rr,
                                            VANDNPSYrr, VANDNPDYrr,
                                            VANDNPSZ256rr,
                                            VANDNPDZ256rr)>;
1856
// 512-bit FP logic zero idioms: scheduled as Zn4WriteZeroLatency when
// ZeroIdiomPredicate holds, and as WriteFLogicZ otherwise.
def Zn4WriteFZeroIdiomZ : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteFLogicZ]>
]>;
def : InstRW<[Zn4WriteFZeroIdiomZ], (instrs VXORPSZrr, VXORPDZrr,
                                            VANDNPSZrr, VANDNPDZrr)>;
1863
// 128-bit integer logic zero idioms: scheduled as Zn4WriteZeroLatency when
// ZeroIdiomPredicate holds, and as WriteVecLogicX otherwise. Only VEX and
// EVEX (Z128) encoded XOR/ANDN forms are bound.
def Zn4WriteVZeroIdiomLogicX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
// NOTE: PXORrr,PANDNrr are not zero-cycle!
def : InstRW<[Zn4WriteVZeroIdiomLogicX], (instrs VPXORrr,
                                                 VPXORDZ128rr,
                                                 VPXORQZ128rr,
                                                 VPANDNrr,
                                                 VPANDNDZ128rr,
                                                 VPANDNQZ128rr)>;
1875
// 256-bit integer logic zero idioms: scheduled as Zn4WriteZeroLatency when
// ZeroIdiomPredicate holds, and as WriteVecLogicY otherwise.
def Zn4WriteVZeroIdiomLogicY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicY]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomLogicY], (instrs VPXORYrr,
                                                 VPXORDZ256rr,
                                                 VPXORQZ256rr,
                                                 VPANDNYrr,
                                                 VPANDNDZ256rr,
                                                 VPANDNQZ256rr)>;
1886
// 512-bit integer logic zero idioms: scheduled as Zn4WriteZeroLatency when
// ZeroIdiomPredicate holds, and as WriteVecLogicZ otherwise.
def Zn4WriteVZeroIdiomLogicZ : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecLogicZ]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomLogicZ], (instrs VPXORDZrr, VPXORQZrr,
                                                 VPANDNDZrr, VPANDNQZrr)>;
1893
// 128-bit integer ALU zero idioms (VPSUB*/VPCMPGT*): scheduled as
// Zn4WriteZeroLatency when ZeroIdiomPredicate holds, and as WriteVecALUX
// otherwise. Only VEX and EVEX (Z128) encoded forms are bound.
def Zn4WriteVZeroIdiomALUX : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
// NOTE: PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
//       PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr are not zero-cycle!
def : InstRW<[Zn4WriteVZeroIdiomALUX],
             (instrs VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
                     VPSUBBZ128rr, VPSUBWZ128rr, VPSUBDZ128rr, VPSUBQZ128rr,
                     VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
                     VPCMPGTBZ128rr, VPCMPGTWZ128rr,
                     VPCMPGTDZ128rr, VPCMPGTQZ128rr)>;
1906
// 256-bit integer ALU zero idioms (VPSUB*/VPCMPGT*): scheduled as
// Zn4WriteZeroLatency when ZeroIdiomPredicate holds, and as WriteVecALUY
// otherwise.
def Zn4WriteVZeroIdiomALUY : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUY]>
]>;
def : InstRW<[Zn4WriteVZeroIdiomALUY],
             (instrs VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
                     VPSUBBZ256rr, VPSUBWZ256rr, VPSUBDZ256rr, VPSUBQZ256rr,
                     VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr,
                     VPCMPGTBZ256rr, VPCMPGTWZ256rr,
                     VPCMPGTDZ256rr, VPCMPGTQZ256rr)>;
1917
// 512-bit integer ALU zero idioms (VPSUB*/VPCMPGT*): scheduled as
// Zn4WriteZeroLatency when ZeroIdiomPredicate holds, and as WriteVecALUZ
// otherwise.
def Zn4WriteVZeroIdiomALUZ : SchedWriteVariant<[
    SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [Zn4WriteZeroLatency]>,
    SchedVar<NoSchedPred,                          [WriteVecALUZ]>
]>;
// The ZMM forms must be bound to the ZMM variant so that their non-idiom
// fallback is WriteVecALUZ rather than the YMM write WriteVecALUY.
def : InstRW<[Zn4WriteVZeroIdiomALUZ],
             (instrs VPSUBBZrr, VPSUBWZrr, VPSUBDZrr, VPSUBQZrr,
                     VPCMPGTBZrr, VPCMPGTWZrr, VPCMPGTDZrr, VPCMPGTQZrr)>;
1925
// Opcodes reported as zero-idioms. Every class below is guarded by
// ZeroIdiomPredicate; classes are grouped by register file / vector width.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[
    XOR32rr, XOR32rr_REV,
    XOR64rr, XOR64rr_REV,
    SUB32rr, SUB32rr_REV,
    SUB64rr, SUB64rr_REV
  ], ZeroIdiomPredicate>,

  // SSE XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX XMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,

    // int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX YMM Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr,

    // int variants.
    VPXORYrr, VPANDNYrr,
    VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
    VPSUBSBYrr, VPSUBSWYrr, VPSUBUSBYrr, VPSUBUSWYrr,
    VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
  ], ZeroIdiomPredicate>,

  // AVX-512 Zero-idioms (Z, Z128 and Z256 forms).
  DepBreakingClass<[
    // fp variants.
    VXORPSZrr,     VXORPDZrr,
    VXORPSZ128rr,  VXORPDZ128rr,
    VXORPSZ256rr,  VXORPDZ256rr,
    VANDNPSZrr,    VANDNPDZrr,
    VANDNPSZ128rr, VANDNPDZ128rr,
    VANDNPSZ256rr, VANDNPDZ256rr,

    // int variants.
    VPCMPGTBZrr,    VPCMPGTWZrr,    VPCMPGTDZrr,    VPCMPGTQZrr,
    VPCMPGTBZ128rr, VPCMPGTWZ128rr, VPCMPGTDZ128rr, VPCMPGTQZ128rr,
    VPCMPGTBZ256rr, VPCMPGTWZ256rr, VPCMPGTDZ256rr, VPCMPGTQZ256rr,
    VPANDNDZrr,    VPANDNQZrr,
    VPANDNDZ128rr, VPANDNQZ128rr,
    VPANDNDZ256rr, VPANDNQZ256rr,
    VPXORDZrr,    VPXORQZrr,
    VPXORDZ128rr, VPXORQZ128rr,
    VPXORDZ256rr, VPXORQZ256rr,
    VPSUBBZrr,    VPSUBWZrr,    VPSUBDZrr,    VPSUBQZrr,
    VPSUBBZ128rr, VPSUBWZ128rr, VPSUBDZ128rr, VPSUBQZ128rr,
    VPSUBBZ256rr, VPSUBWZ256rr, VPSUBDZ256rr, VPSUBQZ256rr
  ], ZeroIdiomPredicate>
]>;
1999
// Opcodes reported as dependency-breaking. The CMP class is guarded by
// CheckSameRegOperand<0, 1>; every other class uses ZeroIdiomPredicate.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[
    SBB32rr, SBB32rr_REV,
    SBB64rr, SBB64rr_REV
  ], ZeroIdiomPredicate>,
  DepBreakingClass<[
    CMP8rr,  CMP8rr_REV,
    CMP16rr, CMP16rr_REV,
    CMP32rr, CMP32rr_REV,
    CMP64rr, CMP64rr_REV
  ], CheckSameRegOperand<0, 1> >,

  // SSE
  DepBreakingClass<[ PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr ],
                   ZeroIdiomPredicate>,

  // AVX XMM
  DepBreakingClass<[ VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr ],
                   ZeroIdiomPredicate>,

  // AVX YMM
  DepBreakingClass<[ VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr ],
                   ZeroIdiomPredicate>
]>;
2023
2024} // SchedModel
2025
2026