xref: /dpdk/drivers/ml/cnxk/cn10k_ml_model.h (revision 8e4dd45c1c1de22225a4660eefd03c4ca593043b)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright (c) 2022 Marvell.
3  */
4 
5 #ifndef _CN10K_ML_MODEL_H_
6 #define _CN10K_ML_MODEL_H_
7 
8 #include <rte_mldev.h>
9 
10 #include <roc_api.h>
11 
12 #include "cn10k_ml_ocm.h"
13 
14 #include "cnxk_ml_io.h"
15 
/* Forward declarations: this header only refers to these types through pointers. */
16 struct cnxk_ml_dev;
17 struct cnxk_ml_model;
18 struct cnxk_ml_layer;
19 struct cnxk_ml_req;
20 
21 /* Model Metadata : v 2.3.0.1 */
22 #define MRVL_ML_MODEL_MAGIC_STRING "MRVL" /* Same four characters as documented for header.magic */
23 #define MRVL_ML_MODEL_TARGET_ARCH  128 /* 128 = MLIP in the header.target_architecture encoding */
24 #define MRVL_ML_MODEL_VERSION_MIN  2100 /* NOTE(review): presumably the minimum accepted metadata version - confirm encoding */
25 #define MRVL_ML_MODEL_NAME_LEN	   64 /* Size of model.name[] */
26 #define MRVL_ML_INPUT_NAME_LEN	   16 /* Size of input_section.input_name[] */
27 #define MRVL_ML_OUTPUT_NAME_LEN	   16 /* Size of output_section.output_name[] */
28 #define MRVL_ML_NUM_INPUT_OUTPUT_1 8 /* Entries in the input1[] / output1[] arrays */
29 #define MRVL_ML_NUM_INPUT_OUTPUT_2 24 /* Entries in the input2[] / output2[] arrays */
30 #define MRVL_ML_NUM_INPUT_OUTPUT   (MRVL_ML_NUM_INPUT_OUTPUT_1 + MRVL_ML_NUM_INPUT_OUTPUT_2) /* 32 in total */
31 
32 /* Header (256-byte): magic, metadata version/size, UUID, target architecture and CRCs */
33 struct cn10k_ml_model_metadata_header {
34 	/* Magic string ('M', 'R', 'V', 'L') */
35 	uint8_t magic[4];
36 
37 	/* Metadata version (4 bytes; NOTE(review): byte-wise encoding not shown here) */
38 	uint8_t version[4];
39 
40 	/* Metadata size (presumably in bytes - TODO confirm) */
41 	uint32_t metadata_size;
42 
43 	/* Unique ID */
44 	uint8_t uuid[128];
45 
46 	/* Model target architecture
47 	 * 0 = Undefined
48 	 * 1 = M1K
49 	 * 128 = MLIP
50 	 * 256 = Experimental
51 	 */
52 	uint32_t target_architecture;
53 	uint8_t reserved[104]; /* Reserved; sized so that the member sizes total 256 bytes */
54 
55 	/* CRC of data after header (i.e. after first 256 bytes) */
56 	uint32_t payload_crc32c;
57 
58 	/* CRC of first 252 bytes of header, after payload_crc calculation */
59 	uint32_t header_crc32c;
60 };
61 
62 /* Model information (256-byte): name, sizes, OCM/DDR ranges, tile range and tool versions */
63 struct cn10k_ml_model_metadata_model {
64 	/* Model name string */
65 	uint8_t name[MRVL_ML_MODEL_NAME_LEN];
66 
67 	/* Model version info (xx.xx.xx.xx) */
68 	uint8_t version[4];
69 
70 	/* Model code size (Init + Main + Finish) */
71 	uint32_t code_size;
72 
73 	/* Model data size (Weights and Bias) */
74 	uint32_t data_size;
75 
76 	/* OCM start offset, set to ocm_wb_range_start */
77 	uint32_t ocm_start;
78 
79 	/* OCM end offset, set to max OCM size */
80 	uint32_t ocm_end;
81 
82 	/* Relocatable flag (always yes)
83 	 * 0 = Not relocatable
84 	 * 1 = Relocatable
85 	 */
86 	uint8_t ocm_relocatable;
87 
88 	/* Tile relocatable flag (always yes)
89 	 * 0 = Not relocatable
90 	 * 1 = Relocatable
91 	 */
92 	uint8_t tile_relocatable;
93 
94 	/* Start tile (Always 0) */
95 	uint8_t tile_start;
96 
97 	/* End tile (num_tiles - 1) */
98 	uint8_t tile_end;
99 
100 	/* Inference batch size */
101 	uint8_t batch_size;
102 
103 	/* Number of input tensors (Max 32) */
104 	uint8_t num_input;
105 
106 	/* Number of output tensors (Max 32) */
107 	uint8_t num_output;
108 	uint8_t reserved_1; /* Reserved; keeps the following uint32_t fields 4-byte aligned */
109 
110 	/* Total input size in bytes */
111 	uint32_t input_size;
112 
113 	/* Total output size in bytes */
114 	uint32_t output_size;
115 
116 	/* Table size in bytes */
117 	uint32_t table_size;
118 
119 	/* Number of layers in the network */
120 	uint32_t num_layers;
121 	uint32_t reserved_2; /* Reserved; keeps the following uint64_t fields 8-byte aligned */
122 
123 	/* Floor of absolute OCM region */
124 	uint64_t ocm_tmp_range_floor;
125 
126 	/* Relative OCM start address of WB data block */
127 	uint64_t ocm_wb_range_start;
128 
129 	/* Relative OCM end address of WB data block */
130 	uint64_t ocm_wb_range_end;
131 
132 	/* Relative DDR start address of WB data block */
133 	uint64_t ddr_wb_range_start;
134 
135 	/* Relative DDR end address of all outputs
	 * NOTE(review): this wording duplicates ddr_output_range_end; by analogy with
	 * ocm_wb_range_end it likely means the end of the WB data block - confirm.
	 */
136 	uint64_t ddr_wb_range_end;
137 
138 	/* Relative DDR start address of all inputs */
139 	uint64_t ddr_input_range_start;
140 
141 	/* Relative DDR end address of all inputs */
142 	uint64_t ddr_input_range_end;
143 
144 	/* Relative DDR start address of all outputs */
145 	uint64_t ddr_output_range_start;
146 
147 	/* Relative DDR end address of all outputs */
148 	uint64_t ddr_output_range_end;
149 
150 	/* Compiler version */
151 	uint8_t compiler_version[8];
152 
153 	/* CDK version */
154 	uint8_t cdk_version[4];
155 
156 	/* Lower batch optimization support
157 	 * 0 - No,
158 	 * 1 - Yes
159 	 */
160 	uint8_t supports_lower_batch_size_optimization;
161 	uint8_t reserved_3[3]; /* Reserved; keeps the following uint64_t fields 8-byte aligned */
162 
163 	/* Relative DDR start address of scratch space */
164 	uint64_t ddr_scratch_range_start;
165 
166 	/* Relative DDR end address of scratch space */
167 	uint64_t ddr_scratch_range_end;
168 	uint8_t reserved_4[40]; /* Reserved; sized so that the member sizes total 256 bytes */
169 };
170 
171 /* Init section (64-byte): where the init part of the model code sits in the model file */
172 struct cn10k_ml_model_metadata_init_section {
173 	uint32_t file_offset; /* Offset of the section in the model file (presumably bytes - TODO confirm) */
174 	uint32_t file_size; /* Size of the section in the model file (presumably bytes - TODO confirm) */
175 	uint8_t reserved[56]; /* Reserved; sized so that the member sizes total 64 bytes */
176 };
177 
178 /* Main section (64-byte): where the main part of the model code sits in the model file */
179 struct cn10k_ml_model_metadata_main_section {
180 	uint32_t file_offset; /* Offset of the section in the model file (presumably bytes - TODO confirm) */
181 	uint32_t file_size; /* Size of the section in the model file (presumably bytes - TODO confirm) */
182 	uint8_t reserved[56]; /* Reserved; sized so that the member sizes total 64 bytes */
183 };
184 
185 /* Finish section (64-byte): where the finish part of the model code sits in the model file */
186 struct cn10k_ml_model_metadata_finish_section {
187 	uint32_t file_offset; /* Offset of the section in the model file (presumably bytes - TODO confirm) */
188 	uint32_t file_size; /* Size of the section in the model file (presumably bytes - TODO confirm) */
189 	uint8_t reserved[56]; /* Reserved; sized so that the member sizes total 64 bytes */
190 };
191 
192 /* Weights and Bias (64-byte): memory placement and file location of the WB data */
193 struct cn10k_ml_model_metadata_weights_bias_section {
194 	/* Memory offset, set to ddr_wb_range_start */
195 	uint64_t mem_offset;
196 	uint32_t file_offset; /* Offset of the WB data in the model file (presumably bytes - TODO confirm) */
197 	uint32_t file_size; /* Size of the WB data in the model file (presumably bytes - TODO confirm) */
198 
199 	/* Relocatable flag for WB
200 	 * 1 = Relocatable
201 	 * 2 = Not relocatable
202 	 */
203 	uint8_t relocatable;
204 	uint8_t reserved[47]; /* Reserved; sized so that the member sizes total 64 bytes */
205 };
206 
207 /* Input section (64-byte per input): placement, types, quantization, shape and name of one input */
208 struct cn10k_ml_model_metadata_input_section {
209 	/* DDR offset (in OCM absolute addresses for input) */
210 	uint64_t mem_offset;
211 
212 	/* Relocatable flag
213 	 * 1 = Relocatable
214 	 * 2 = Not relocatable
215 	 */
216 	uint8_t relocatable;
217 
218 	/* Input quantization
219 	 * 1 = Requires quantization
220 	 * 2 = Pre-quantized
221 	 */
222 	uint8_t quantize;
223 
224 	/* Type of incoming input
225 	 * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16,
226 	 * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32
227 	 */
228 	uint8_t input_type;
229 
230 	/* Type of input required by model
231 	 * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16,
232 	 * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32
233 	 */
234 	uint8_t model_input_type;
235 
236 	/* float_32 qscale value
237 	 * quantized = non-quantized * qscale
238 	 */
239 	float qscale;
240 
241 	/* Input shape */
242 	struct {
243 		/* Input format
244 		 * 1 = NCHW
245 		 * 2 = NHWC
246 		 */
247 		uint8_t format;
248 		uint8_t reserved[3]; /* Reserved; keeps the following uint32_t fields 4-byte aligned */
249 		uint32_t w; /* NOTE(review): w/x/y/z are presumably the four dimension sizes; axis mapping vs. format not shown here */
250 		uint32_t x;
251 		uint32_t y;
252 		uint32_t z;
253 	} shape;
254 	uint8_t reserved[4]; /* Reserved */
255 
256 	/* Name of input */
257 	uint8_t input_name[MRVL_ML_INPUT_NAME_LEN];
258 
259 	/* DDR range end
260 	 * new = mem_offset + size_bytes - 1
261 	 */
262 	uint64_t ddr_range_end;
263 };
264 
265 /* Output section (64-byte per output): placement, types, dequantization, size and name of one output */
266 struct cn10k_ml_model_metadata_output_section {
267 	/* DDR offset in OCM absolute addresses for output */
268 	uint64_t mem_offset;
269 
270 	/* Relocatable flag
271 	 * 1 = Relocatable
272 	 * 2 = Not relocatable
273 	 */
274 	uint8_t relocatable;
275 
276 	/* Output dequantization
277 	 * 1 = De-quantization required
278 	 * 2 = De-quantization not required
279 	 */
280 	uint8_t dequantize;
281 
282 	/* Type of outgoing output
283 	 * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16
284 	 * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32
285 	 */
286 	uint8_t output_type;
287 
288 	/* Type of output produced by model
289 	 * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16
290 	 * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32
291 	 */
292 	uint8_t model_output_type;
293 
294 	/* float_32 dscale value
295 	 * dequantized = quantized * dscale
296 	 */
297 	float dscale;
298 
299 	/* Number of items in the output */
300 	uint32_t size;
301 	uint8_t reserved[20]; /* Reserved */
302 
303 	/* DDR range end
304 	 * new = mem_offset + size_bytes - 1
305 	 */
306 	uint64_t ddr_range_end;
307 	uint8_t output_name[MRVL_ML_OUTPUT_NAME_LEN]; /* Name of output */
308 };
309 
310 /* Model data: reserved space followed by tool version fields (member sizes total 1008 bytes) */
311 struct cn10k_ml_model_metadata_data_section {
312 	uint8_t reserved[996]; /* Reserved */
313 
314 	/* Beta: xx.xx.xx.xx,
315 	 * Later: YYYYMM.xx.xx
316 	 */
317 	uint8_t compiler_version[8];
318 
319 	/* M1K CDK version (xx.xx.xx.xx) */
320 	uint8_t m1k_cdk_version[4];
321 };
322 
323 /* Model file metadata structure
 * Aggregates all metadata sections in file order. The member sizes sum to
 * 8192 bytes (assuming natural alignment inserts no padding between members).
 * Inputs and outputs are split over two array pairs: input1/output1 hold
 * MRVL_ML_NUM_INPUT_OUTPUT_1 entries, input2/output2 hold
 * MRVL_ML_NUM_INPUT_OUTPUT_2 entries.
 */
324 struct cn10k_ml_model_metadata {
325 	/* Header (256-byte) */
326 	struct cn10k_ml_model_metadata_header header;
327 
328 	/* Model information (256-byte) */
329 	struct cn10k_ml_model_metadata_model model;
330 
331 	/* Init section (64-byte) */
332 	struct cn10k_ml_model_metadata_init_section init_model;
333 
334 	/* Main section (64-byte) */
335 	struct cn10k_ml_model_metadata_main_section main_model;
336 
337 	/* Finish section (64-byte) */
338 	struct cn10k_ml_model_metadata_finish_section finish_model;
339 
340 	uint8_t reserved_1[512]; /* End of 2k bytes - NOTE(review): members up to here sum to 1216 bytes, not 2048; confirm intent */
341 
342 	/* Weights and Bias (64-byte) */
343 	struct cn10k_ml_model_metadata_weights_bias_section weights_bias;
344 
345 	/* Input (512-bytes, 64-byte per input) provisioned for 8 inputs */
346 	struct cn10k_ml_model_metadata_input_section input1[MRVL_ML_NUM_INPUT_OUTPUT_1];
347 
348 	/* Output (512-bytes, 64-byte per output) provisioned for 8 outputs */
349 	struct cn10k_ml_model_metadata_output_section output1[MRVL_ML_NUM_INPUT_OUTPUT_1];
350 
351 	uint8_t reserved_2[1792]; /* Reserved; members up to and including this one sum to 4096 bytes */
352 
353 	/* Input (1536-bytes, 64-byte per input) provisioned for 24 inputs */
354 	struct cn10k_ml_model_metadata_input_section input2[MRVL_ML_NUM_INPUT_OUTPUT_2];
355 
356 	/* Output (1536-bytes, 64-byte per output) provisioned for 24 outputs */
357 	struct cn10k_ml_model_metadata_output_section output2[MRVL_ML_NUM_INPUT_OUTPUT_2];
358 
359 	/* Model data */
360 	struct cn10k_ml_model_metadata_data_section data;
361 
362 	/* Hidden 16 bytes of magic code */
363 	uint8_t reserved_3[16];
364 };
365 
366 /* Layer address structure: load/base addresses of a layer's sections and its tile range */
367 struct cn10k_ml_layer_addr {
368 	/* Base DMA address for load */
369 	void *base_dma_addr_load;
370 
371 	/* Init section load address */
372 	void *init_load_addr;
373 
374 	/* Main section load address */
375 	void *main_load_addr;
376 
377 	/* Finish section load address */
378 	void *finish_load_addr;
379 
380 	/* Weights and Bias base address */
381 	void *wb_base_addr;
382 
383 	/* Weights and bias load address */
384 	void *wb_load_addr;
385 
386 	/* Scratch base address */
387 	void *scratch_base_addr;
388 
389 	/* Start tile */
390 	uint8_t tile_start;
391 
392 	/* End tile */
393 	uint8_t tile_end;
394 };
395 
396 /* Layer fast-path stats: hardware/firmware latency accumulators and counters */
397 struct cn10k_ml_layer_xstats {
398 	/* Total hardware latency, sum of all inferences */
399 	uint64_t hw_latency_tot;
400 
401 	/* Minimum hardware latency */
402 	uint64_t hw_latency_min;
403 
404 	/* Maximum hardware latency */
405 	uint64_t hw_latency_max;
406 
407 	/* Total firmware latency, sum of all inferences */
408 	uint64_t fw_latency_tot;
409 
410 	/* Minimum firmware latency */
411 	uint64_t fw_latency_min;
412 
413 	/* Maximum firmware latency */
414 	uint64_t fw_latency_max;
415 
416 	/* Total jobs dequeued */
417 	uint64_t dequeued_count;
418 
419 	/* Hardware stats reset index */
420 	uint64_t hw_reset_count;
421 
422 	/* Firmware stats reset index */
423 	uint64_t fw_reset_count;
424 };
425 
/* Per-layer data: metadata copy, addresses, OCM map, slow-path request and stats pointers */
426 struct cn10k_ml_layer_data {
427 	/* Model / Layer: metadata */
428 	struct cn10k_ml_model_metadata metadata;
429 
430 	/* Layer: address structure */
431 	struct cn10k_ml_layer_addr addr;
432 
433 	/* Layer: Tile and memory information object */
434 	struct cn10k_ml_ocm_layer_map ocm_map;
435 
436 	/* Layer: Slow-path operations request pointer */
437 	struct cnxk_ml_req *req;
438 
439 	/* Layer: Stats for burst ops */
440 	struct cn10k_ml_layer_xstats *burst_xstats;
441 
442 	/* Layer: Stats for sync ops */
443 	struct cn10k_ml_layer_xstats *sync_xstats;
444 };
445 
/* Per-model data: holds the model file metadata by value */
446 struct cn10k_ml_model_data {
447 	/* Model / Layer: metadata */
448 	struct cn10k_ml_model_metadata metadata;
449 };
450 
/* Model / layer helper prototypes.
 * NOTE(review): the definitions are not visible in this header; the one-line
 * descriptions below are inferred from names and signatures only and must be
 * confirmed against the implementation.
 */

/* Presumably validates the metadata of the model image in buffer (size bytes); int status return. */
451 int cn10k_ml_model_metadata_check(uint8_t *buffer, uint64_t size);

/* Presumably adjusts/normalizes fields of metadata in place. */
452 void cn10k_ml_model_metadata_update(struct cnxk_ml_model_metadata *metadata);

/* Presumably fills the layer's address structure from the model buffer and a base DMA address. */
453 void cn10k_ml_layer_addr_update(struct cnxk_ml_layer *layer, uint8_t *buffer,
454 				uint8_t *base_dma_addr);

/* Presumably populates io_info from the input/output sections of metadata. */
455 void cn10k_ml_layer_io_info_set(struct cnxk_ml_io_info *io_info,
456 				struct cn10k_ml_model_metadata *metadata);

/* Presumably returns the I/O info of the layer identified by layer_id within model. */
457 struct cnxk_ml_io_info *cn10k_ml_model_io_info_get(struct cnxk_ml_model *model, uint16_t layer_id);

/* Presumably computes OCM page counts for WB and scratch, written to *wb_pages / *scratch_pages; int status return. */
458 int cn10k_ml_model_ocm_pages_count(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer,
459 				   uint8_t *buffer, uint16_t *wb_pages, uint16_t *scratch_pages);

/* Presumably fills the model's info from io_info and metadata. */
460 void cn10k_ml_model_info_set(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
461 			     struct cnxk_ml_io_info *io_info,
462 			     struct cn10k_ml_model_metadata *metadata);

/* Presumably prints a description of layer to the stream fp. */
463 void cn10k_ml_layer_print(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer, FILE *fp);

/* Presumably looks up layer_name in model and stores the match in *layer_id; int status return. */
464 int cn10k_ml_model_get_layer_id(struct cnxk_ml_model *model, const char *layer_name,
465 				uint16_t *layer_id);
466 
467 #endif /* _CN10K_ML_MODEL_H_ */
468