1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) 2022 Marvell. 3 */ 4 5 #ifndef _CN10K_ML_MODEL_H_ 6 #define _CN10K_ML_MODEL_H_ 7 8 #include <rte_mldev.h> 9 10 #include <roc_api.h> 11 12 #include "cn10k_ml_ocm.h" 13 14 #include "cnxk_ml_io.h" 15 16 struct cnxk_ml_dev; 17 struct cnxk_ml_model; 18 struct cnxk_ml_layer; 19 struct cnxk_ml_req; 20 21 /* Model Metadata : v 2.3.0.1 */ 22 #define MRVL_ML_MODEL_MAGIC_STRING "MRVL" 23 #define MRVL_ML_MODEL_TARGET_ARCH 128 24 #define MRVL_ML_MODEL_VERSION_MIN 2100 25 #define MRVL_ML_MODEL_NAME_LEN 64 26 #define MRVL_ML_INPUT_NAME_LEN 16 27 #define MRVL_ML_OUTPUT_NAME_LEN 16 28 #define MRVL_ML_NUM_INPUT_OUTPUT_1 8 29 #define MRVL_ML_NUM_INPUT_OUTPUT_2 24 30 #define MRVL_ML_NUM_INPUT_OUTPUT (MRVL_ML_NUM_INPUT_OUTPUT_1 + MRVL_ML_NUM_INPUT_OUTPUT_2) 31 32 /* Header (256-byte) */ 33 struct cn10k_ml_model_metadata_header { 34 /* Magic string ('M', 'R', 'V', 'L') */ 35 uint8_t magic[4]; 36 37 /* Metadata version */ 38 uint8_t version[4]; 39 40 /* Metadata size */ 41 uint32_t metadata_size; 42 43 /* Unique ID */ 44 uint8_t uuid[128]; 45 46 /* Model target architecture 47 * 0 = Undefined 48 * 1 = M1K 49 * 128 = MLIP 50 * 256 = Experimental 51 */ 52 uint32_t target_architecture; 53 uint8_t reserved[104]; 54 55 /* CRC of data after header (i.e. after first 256 bytes) */ 56 uint32_t payload_crc32c; 57 58 /* CRC of first 252 bytes of header, after payload_crc calculation */ 59 uint32_t header_crc32c; 60 }; 61 62 /* Model information (256-byte) */ 63 struct cn10k_ml_model_metadata_model { 64 /* Model name string */ 65 uint8_t name[MRVL_ML_MODEL_NAME_LEN]; 66 67 /* Model version info (xx.xx.xx.xx) */ 68 uint8_t version[4]; 69 70 /* Model code size (Init + Main + Finish) */ 71 uint32_t code_size; 72 73 /* Model data size (Weights and Bias) */ 74 uint32_t data_size; 75 76 /* OCM start offset, set to ocm_wb_range_start */ 77 uint32_t ocm_start; 78 79 /* OCM start offset, set to max OCM size */ 80 uint32_t ocm_end; 81 82 /* Relocatable flag (always yes) 83 * 0 = Not relocatable 84 * 1 = Relocatable 85 */ 86 uint8_t ocm_relocatable; 87 88 /* Tile relocatable flag (always yes) 89 * 0 = Not relocatable 90 * 1 = Relocatable 91 */ 92 uint8_t tile_relocatable; 93 94 /* Start tile (Always 0) */ 95 uint8_t tile_start; 96 97 /* End tile (num_tiles - 1) */ 98 uint8_t tile_end; 99 100 /* Inference batch size */ 101 uint8_t batch_size; 102 103 /* Number of input tensors (Max 32) */ 104 uint8_t num_input; 105 106 /* Number of output tensors (Max 32) */ 107 uint8_t num_output; 108 uint8_t reserved_1; 109 110 /* Total input size in bytes */ 111 uint32_t input_size; 112 113 /* Total output size in bytes */ 114 uint32_t output_size; 115 116 /* Table size in bytes */ 117 uint32_t table_size; 118 119 /* Number of layers in the network */ 120 uint32_t num_layers; 121 uint32_t reserved_2; 122 123 /* Floor of absolute OCM region */ 124 uint64_t ocm_tmp_range_floor; 125 126 /* Relative OCM start address of WB data block */ 127 uint64_t ocm_wb_range_start; 128 129 /* Relative OCM end address of WB data block */ 130 uint64_t ocm_wb_range_end; 131 132 /* Relative DDR start address of WB data block */ 133 uint64_t ddr_wb_range_start; 134 135 /* Relative DDR end address of all outputs */ 136 uint64_t ddr_wb_range_end; 137 138 /* Relative DDR start address of all inputs */ 139 uint64_t ddr_input_range_start; 140 141 /* Relative DDR end address of all inputs */ 142 uint64_t ddr_input_range_end; 143 144 /* Relative DDR start address of all outputs */ 145 uint64_t ddr_output_range_start; 146 147 /* Relative DDR end address of all outputs */ 148 uint64_t ddr_output_range_end; 149 150 /* Compiler version */ 151 uint8_t compiler_version[8]; 152 153 /* CDK version */ 154 uint8_t cdk_version[4]; 155 156 /* Lower batch optimization support 157 * 0 - No, 158 * 1 - Yes 159 */ 160 uint8_t supports_lower_batch_size_optimization; 161 uint8_t reserved_3[3]; 162 163 /* Relative DDR start address of scratch space */ 164 uint64_t ddr_scratch_range_start; 165 166 /* Relative DDR end address of scratch space */ 167 uint64_t ddr_scratch_range_end; 168 uint8_t reserved_4[40]; 169 }; 170 171 /* Init section (64-byte) */ 172 struct cn10k_ml_model_metadata_init_section { 173 uint32_t file_offset; 174 uint32_t file_size; 175 uint8_t reserved[56]; 176 }; 177 178 /* Main section (64-byte) */ 179 struct cn10k_ml_model_metadata_main_section { 180 uint32_t file_offset; 181 uint32_t file_size; 182 uint8_t reserved[56]; 183 }; 184 185 /* Finish section (64-byte) */ 186 struct cn10k_ml_model_metadata_finish_section { 187 uint32_t file_offset; 188 uint32_t file_size; 189 uint8_t reserved[56]; 190 }; 191 192 /* Weights and Bias (64-byte) */ 193 struct cn10k_ml_model_metadata_weights_bias_section { 194 /* Memory offset, set to ddr_wb_range_start */ 195 uint64_t mem_offset; 196 uint32_t file_offset; 197 uint32_t file_size; 198 199 /* Relocatable flag for WB 200 * 1 = Relocatable 201 * 2 = Not relocatable 202 */ 203 uint8_t relocatable; 204 uint8_t reserved[47]; 205 }; 206 207 /* Input section (64-byte per input) */ 208 struct cn10k_ml_model_metadata_input_section { 209 /* DDR offset (in OCM absolute addresses for input) */ 210 uint64_t mem_offset; 211 212 /* Relocatable flag 213 * 1 = Relocatable 214 * 2 = Not relocatable 215 */ 216 uint8_t relocatable; 217 218 /* Input quantization 219 * 1 = Requires quantization 220 * 2 = Pre-quantized 221 */ 222 uint8_t quantize; 223 224 /* Type of incoming input 225 * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16, 226 * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32 227 */ 228 uint8_t input_type; 229 230 /* Type of input required by model 231 * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16, 232 * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32 233 */ 234 uint8_t model_input_type; 235 236 /* float_32 qscale value 237 * quantized = non-quantized * qscale 238 */ 239 float qscale; 240 241 /* Input shape */ 242 struct { 243 /* Input format 244 * 1 = NCHW 245 * 2 = NHWC 246 */ 247 uint8_t format; 248 uint8_t reserved[3]; 249 uint32_t w; 250 uint32_t x; 251 uint32_t y; 252 uint32_t z; 253 } shape; 254 uint8_t reserved[4]; 255 256 /* Name of input */ 257 uint8_t input_name[MRVL_ML_INPUT_NAME_LEN]; 258 259 /* DDR range end 260 * new = mem_offset + size_bytes - 1 261 */ 262 uint64_t ddr_range_end; 263 }; 264 265 /* Output section (64-byte per output) */ 266 struct cn10k_ml_model_metadata_output_section { 267 /* DDR offset in OCM absolute addresses for output */ 268 uint64_t mem_offset; 269 270 /* Relocatable flag 271 * 1 = Relocatable 272 * 2 = Not relocatable 273 */ 274 uint8_t relocatable; 275 276 /* Output dequantization 277 * 1 = De-quantization required 278 * 2 = De-quantization not required 279 */ 280 uint8_t dequantize; 281 282 /* Type of outgoing output 283 * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16 284 * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32 285 */ 286 uint8_t output_type; 287 288 /* Type of output produced by model 289 * 1 = INT8, 2 = UINT8, 3 = INT16, 4 = UINT16 290 * 5 = INT32, 6 = UINT32, 7 = FP16, 8 = FP32 291 */ 292 uint8_t model_output_type; 293 294 /* float_32 dscale value 295 * dequantized = quantized * dscale 296 */ 297 float dscale; 298 299 /* Number of items in the output */ 300 uint32_t size; 301 uint8_t reserved[20]; 302 303 /* DDR range end 304 * new = mem_offset + size_bytes - 1 305 */ 306 uint64_t ddr_range_end; 307 uint8_t output_name[MRVL_ML_OUTPUT_NAME_LEN]; 308 }; 309 310 /* Model data */ 311 struct cn10k_ml_model_metadata_data_section { 312 uint8_t reserved[996]; 313 314 /* Beta: xx.xx.xx.xx, 315 * Later: YYYYMM.xx.xx 316 */ 317 uint8_t compiler_version[8]; 318 319 /* M1K CDK version (xx.xx.xx.xx) */ 320 uint8_t m1k_cdk_version[4]; 321 }; 322 323 /* Model file metadata structure */ 324 struct cn10k_ml_model_metadata { 325 /* Header (256-byte) */ 326 struct cn10k_ml_model_metadata_header header; 327 328 /* Model information (256-byte) */ 329 struct cn10k_ml_model_metadata_model model; 330 331 /* Init section (64-byte) */ 332 struct cn10k_ml_model_metadata_init_section init_model; 333 334 /* Main section (64-byte) */ 335 struct cn10k_ml_model_metadata_main_section main_model; 336 337 /* Finish section (64-byte) */ 338 struct cn10k_ml_model_metadata_finish_section finish_model; 339 340 uint8_t reserved_1[512]; /* End of 2k bytes */ 341 342 /* Weights and Bias (64-byte) */ 343 struct cn10k_ml_model_metadata_weights_bias_section weights_bias; 344 345 /* Input (512-bytes, 64-byte per input) provisioned for 8 inputs */ 346 struct cn10k_ml_model_metadata_input_section input1[MRVL_ML_NUM_INPUT_OUTPUT_1]; 347 348 /* Output (512-bytes, 64-byte per output) provisioned for 8 outputs */ 349 struct cn10k_ml_model_metadata_output_section output1[MRVL_ML_NUM_INPUT_OUTPUT_1]; 350 351 uint8_t reserved_2[1792]; 352 353 /* Input (1536-bytes, 64-byte per input) provisioned for 24 inputs */ 354 struct cn10k_ml_model_metadata_input_section input2[MRVL_ML_NUM_INPUT_OUTPUT_2]; 355 356 /* Output (1536-bytes, 64-byte per output) provisioned for 24 outputs */ 357 struct cn10k_ml_model_metadata_output_section output2[MRVL_ML_NUM_INPUT_OUTPUT_2]; 358 359 /* Model data */ 360 struct cn10k_ml_model_metadata_data_section data; 361 362 /* Hidden 16 bytes of magic code */ 363 uint8_t reserved_3[16]; 364 }; 365 366 /* Model address structure */ 367 struct cn10k_ml_layer_addr { 368 /* Base DMA address for load */ 369 void *base_dma_addr_load; 370 371 /* Init section load address */ 372 void *init_load_addr; 373 374 /* Main section load address */ 375 void *main_load_addr; 376 377 /* Finish section load address */ 378 void *finish_load_addr; 379 380 /* Weights and Bias base address */ 381 void *wb_base_addr; 382 383 /* Weights and bias load address */ 384 void *wb_load_addr; 385 386 /* Scratch base address */ 387 void *scratch_base_addr; 388 389 /* Start tile */ 390 uint8_t tile_start; 391 392 /* End tile */ 393 uint8_t tile_end; 394 }; 395 396 /* Model fast-path stats */ 397 struct cn10k_ml_layer_xstats { 398 /* Total hardware latency, sum of all inferences */ 399 uint64_t hw_latency_tot; 400 401 /* Minimum hardware latency */ 402 uint64_t hw_latency_min; 403 404 /* Maximum hardware latency */ 405 uint64_t hw_latency_max; 406 407 /* Total firmware latency, sum of all inferences */ 408 uint64_t fw_latency_tot; 409 410 /* Minimum firmware latency */ 411 uint64_t fw_latency_min; 412 413 /* Maximum firmware latency */ 414 uint64_t fw_latency_max; 415 416 /* Total jobs dequeued */ 417 uint64_t dequeued_count; 418 419 /* Hardware stats reset index */ 420 uint64_t hw_reset_count; 421 422 /* Firmware stats reset index */ 423 uint64_t fw_reset_count; 424 }; 425 426 struct cn10k_ml_layer_data { 427 /* Model / Layer: metadata */ 428 struct cn10k_ml_model_metadata metadata; 429 430 /* Layer: address structure */ 431 struct cn10k_ml_layer_addr addr; 432 433 /* Layer: Tile and memory information object */ 434 struct cn10k_ml_ocm_layer_map ocm_map; 435 436 /* Layer: Slow-path operations request pointer */ 437 struct cnxk_ml_req *req; 438 439 /* Layer: Stats for burst ops */ 440 struct cn10k_ml_layer_xstats *burst_xstats; 441 442 /* Layer: Stats for sync ops */ 443 struct cn10k_ml_layer_xstats *sync_xstats; 444 }; 445 446 struct cn10k_ml_model_data { 447 /* Model / Layer: metadata */ 448 struct cn10k_ml_model_metadata metadata; 449 }; 450 451 int cn10k_ml_model_metadata_check(uint8_t *buffer, uint64_t size); 452 void cn10k_ml_model_metadata_update(struct cn10k_ml_model_metadata *metadata); 453 void cn10k_ml_layer_addr_update(struct cnxk_ml_layer *layer, uint8_t *buffer, 454 uint8_t *base_dma_addr); 455 void cn10k_ml_layer_io_info_set(struct cnxk_ml_io_info *io_info, 456 struct cn10k_ml_model_metadata *metadata); 457 struct cnxk_ml_io_info *cn10k_ml_model_io_info_get(struct cnxk_ml_model *model, uint16_t layer_id); 458 int cn10k_ml_model_ocm_pages_count(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer, 459 uint8_t *buffer, uint16_t *wb_pages, uint16_t *scratch_pages); 460 void cn10k_ml_model_info_set(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model, 461 struct cnxk_ml_io_info *io_info, 462 struct cn10k_ml_model_metadata *metadata); 463 void cn10k_ml_layer_print(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_layer *layer, FILE *fp); 464 int cn10k_ml_model_get_layer_id(struct cnxk_ml_model *model, const char *layer_name, 465 uint16_t *layer_id); 466 467 #endif /* _CN10K_ML_MODEL_H_ */ 468