1 /* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11 #include "data.h" 12 13 #include <assert.h> 14 #include <errno.h> 15 #include <stdio.h> 16 #include <string.h> 17 #include <stdlib.h> /* free() */ 18 19 #include <sys/stat.h> 20 21 #include <curl/curl.h> 22 23 #include "mem.h" 24 #include "util.h" 25 #define XXH_STATIC_LINKING_ONLY 26 #include "xxhash.h" 27 28 /** 29 * Data objects 30 */ 31 32 #define REGRESSION_RELEASE(x) \ 33 "https://github.com/facebook/zstd/releases/download/regression-data/" x 34 35 data_t silesia = { 36 .name = "silesia", 37 .type = data_type_dir, 38 .data = 39 { 40 .url = REGRESSION_RELEASE("silesia.tar.zst"), 41 .xxhash64 = 0x48a199f92f93e977LL, 42 }, 43 }; 44 45 data_t silesia_tar = { 46 .name = "silesia.tar", 47 .type = data_type_file, 48 .data = 49 { 50 .url = REGRESSION_RELEASE("silesia.tar.zst"), 51 .xxhash64 = 0x48a199f92f93e977LL, 52 }, 53 }; 54 55 data_t github = { 56 .name = "github", 57 .type = data_type_dir, 58 .data = 59 { 60 .url = REGRESSION_RELEASE("github.tar.zst"), 61 .xxhash64 = 0xa9b1b44b020df292LL, 62 }, 63 .dict = 64 { 65 .url = REGRESSION_RELEASE("github.dict.zst"), 66 .xxhash64 = 0x1eddc6f737d3cb53LL, 67 68 }, 69 }; 70 71 data_t github_tar = { 72 .name = "github.tar", 73 .type = data_type_file, 74 .data = 75 { 76 .url = REGRESSION_RELEASE("github.tar.zst"), 77 .xxhash64 = 0xa9b1b44b020df292LL, 78 }, 79 .dict = 80 { 81 .url = REGRESSION_RELEASE("github.dict.zst"), 82 .xxhash64 = 0x1eddc6f737d3cb53LL, 83 84 }, 85 }; 86 87 static data_t* g_data[] = { 88 &silesia, 89 &silesia_tar, 90 &github, 91 &github_tar, 92 NULL, 93 }; 94 95 data_t const* const* data = (data_t const* const*)g_data; 96 97 /** 98 * data helpers. 99 */ 100 101 int data_has_dict(data_t const* data) { 102 return data->dict.url != NULL; 103 } 104 105 /** 106 * data buffer helper functions (documented in header). 107 */ 108 109 data_buffer_t data_buffer_create(size_t const capacity) { 110 data_buffer_t buffer = {}; 111 112 buffer.data = (uint8_t*)malloc(capacity); 113 if (buffer.data == NULL) 114 return buffer; 115 buffer.capacity = capacity; 116 return buffer; 117 } 118 119 data_buffer_t data_buffer_read(char const* filename) { 120 data_buffer_t buffer = {}; 121 122 uint64_t const size = UTIL_getFileSize(filename); 123 if (size == UTIL_FILESIZE_UNKNOWN) { 124 fprintf(stderr, "unknown size for %s\n", filename); 125 return buffer; 126 } 127 128 buffer.data = (uint8_t*)malloc(size); 129 if (buffer.data == NULL) { 130 fprintf(stderr, "malloc failed\n"); 131 return buffer; 132 } 133 buffer.capacity = size; 134 135 FILE* file = fopen(filename, "rb"); 136 if (file == NULL) { 137 fprintf(stderr, "file null\n"); 138 goto err; 139 } 140 buffer.size = fread(buffer.data, 1, buffer.capacity, file); 141 fclose(file); 142 if (buffer.size != buffer.capacity) { 143 fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity); 144 goto err; 145 } 146 147 return buffer; 148 err: 149 free(buffer.data); 150 memset(&buffer, 0, sizeof(buffer)); 151 return buffer; 152 } 153 154 data_buffer_t data_buffer_get_data(data_t const* data) { 155 data_buffer_t const kEmptyBuffer = {}; 156 157 if (data->type != data_type_file) 158 return kEmptyBuffer; 159 160 return data_buffer_read(data->data.path); 161 } 162 163 data_buffer_t data_buffer_get_dict(data_t const* data) { 164 data_buffer_t const kEmptyBuffer = {}; 165 166 if (!data_has_dict(data)) 167 return kEmptyBuffer; 168 169 return data_buffer_read(data->dict.path); 170 } 171 172 int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) { 173 size_t const size = 174 buffer1.size < buffer2.size ? buffer1.size : buffer2.size; 175 int const cmp = memcmp(buffer1.data, buffer2.data, size); 176 if (cmp != 0) 177 return cmp; 178 if (buffer1.size < buffer2.size) 179 return -1; 180 if (buffer1.size == buffer2.size) 181 return 0; 182 assert(buffer1.size > buffer2.size); 183 return 1; 184 } 185 186 void data_buffer_free(data_buffer_t buffer) { 187 free(buffer.data); 188 } 189 190 /** 191 * data filenames helpers. 192 */ 193 194 FileNamesTable* data_filenames_get(data_t const* data) 195 { 196 char const* const path = data->data.path; 197 return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ ); 198 } 199 200 /** 201 * data buffers helpers. 202 */ 203 204 data_buffers_t data_buffers_get(data_t const* data) { 205 data_buffers_t buffers = {.size = 0}; 206 FileNamesTable* const filenames = data_filenames_get(data); 207 if (filenames == NULL) return buffers; 208 if (filenames->tableSize == 0) { 209 UTIL_freeFileNamesTable(filenames); 210 return buffers; 211 } 212 213 data_buffer_t* buffersPtr = 214 (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr)); 215 if (buffersPtr == NULL) { 216 UTIL_freeFileNamesTable(filenames); 217 return buffers; 218 } 219 buffers.buffers = (data_buffer_t const*)buffersPtr; 220 buffers.size = filenames->tableSize; 221 222 for (size_t i = 0; i < filenames->tableSize; ++i) { 223 buffersPtr[i] = data_buffer_read(filenames->fileNames[i]); 224 if (buffersPtr[i].data == NULL) { 225 data_buffers_t const kEmptyBuffer = {}; 226 data_buffers_free(buffers); 227 UTIL_freeFileNamesTable(filenames); 228 return kEmptyBuffer; 229 } 230 } 231 232 UTIL_freeFileNamesTable(filenames); 233 return buffers; 234 } 235 236 /** 237 * Frees the data buffers. 238 */ 239 void data_buffers_free(data_buffers_t buffers) { 240 free((data_buffer_t*)buffers.buffers); 241 } 242 243 /** 244 * Initialization and download functions. 245 */ 246 247 static char* g_data_dir = NULL; 248 249 /* mkdir -p */ 250 static int ensure_directory_exists(char const* indir) { 251 char* const dir = strdup(indir); 252 char* end = dir; 253 int ret = 0; 254 if (dir == NULL) { 255 ret = EINVAL; 256 goto out; 257 } 258 do { 259 /* Find the next directory level. */ 260 for (++end; *end != '\0' && *end != '/'; ++end) 261 ; 262 /* End the string there, make the directory, and restore the string. */ 263 char const save = *end; 264 *end = '\0'; 265 int const isdir = UTIL_isDirectory(dir); 266 ret = mkdir(dir, S_IRWXU); 267 *end = save; 268 /* Its okay if the directory already exists. */ 269 if (ret == 0 || (errno == EEXIST && isdir)) 270 continue; 271 ret = errno; 272 fprintf(stderr, "mkdir() failed\n"); 273 goto out; 274 } while (*end != '\0'); 275 276 ret = 0; 277 out: 278 free(dir); 279 return ret; 280 } 281 282 /** Concatenate 3 strings into a new buffer. */ 283 static char* cat3(char const* str1, char const* str2, char const* str3) { 284 size_t const size1 = strlen(str1); 285 size_t const size2 = strlen(str2); 286 size_t const size3 = str3 == NULL ? 0 : strlen(str3); 287 size_t const size = size1 + size2 + size3 + 1; 288 char* const dst = (char*)malloc(size); 289 if (dst == NULL) 290 return NULL; 291 strcpy(dst, str1); 292 strcpy(dst + size1, str2); 293 if (str3 != NULL) 294 strcpy(dst + size1 + size2, str3); 295 assert(strlen(dst) == size1 + size2 + size3); 296 return dst; 297 } 298 299 static char* cat2(char const* str1, char const* str2) { 300 return cat3(str1, str2, NULL); 301 } 302 303 /** 304 * State needed by the curl callback. 305 * It takes data from curl, hashes it, and writes it to the file. 306 */ 307 typedef struct { 308 FILE* file; 309 XXH64_state_t xxhash64; 310 int error; 311 } curl_data_t; 312 313 /** Create the curl state. */ 314 static curl_data_t curl_data_create( 315 data_resource_t const* resource, 316 data_type_t type) { 317 curl_data_t cdata = {}; 318 319 XXH64_reset(&cdata.xxhash64, 0); 320 321 assert(UTIL_isDirectory(g_data_dir)); 322 323 if (type == data_type_file) { 324 /* Decompress the resource and store to the path. */ 325 char* cmd = cat3("zstd -dqfo '", resource->path, "'"); 326 if (cmd == NULL) { 327 cdata.error = ENOMEM; 328 return cdata; 329 } 330 cdata.file = popen(cmd, "w"); 331 free(cmd); 332 } else { 333 /* Decompress and extract the resource to the cache directory. */ 334 char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'"); 335 if (cmd == NULL) { 336 cdata.error = ENOMEM; 337 return cdata; 338 } 339 cdata.file = popen(cmd, "w"); 340 free(cmd); 341 } 342 if (cdata.file == NULL) { 343 cdata.error = errno; 344 } 345 346 return cdata; 347 } 348 349 /** Free the curl state. */ 350 static int curl_data_free(curl_data_t cdata) { 351 return pclose(cdata.file); 352 } 353 354 /** curl callback. Updates the hash, and writes to the file. */ 355 static size_t curl_write(void* data, size_t size, size_t count, void* ptr) { 356 curl_data_t* cdata = (curl_data_t*)ptr; 357 size_t const written = fwrite(data, size, count, cdata->file); 358 XXH64_update(&cdata->xxhash64, data, written * size); 359 return written; 360 } 361 362 static int curl_download_resource( 363 CURL* curl, 364 data_resource_t const* resource, 365 data_type_t type) { 366 curl_data_t cdata; 367 /* Download the data. */ 368 if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0) 369 return EINVAL; 370 if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0) 371 return EINVAL; 372 cdata = curl_data_create(resource, type); 373 if (cdata.error != 0) 374 return cdata.error; 375 int const curl_err = curl_easy_perform(curl); 376 int const close_err = curl_data_free(cdata); 377 if (curl_err) { 378 fprintf( 379 stderr, 380 "downloading '%s' for '%s' failed\n", 381 resource->url, 382 resource->path); 383 return EIO; 384 } 385 if (close_err) { 386 fprintf(stderr, "writing data to '%s' failed\n", resource->path); 387 return EIO; 388 } 389 /* check that the file exists. */ 390 if (type == data_type_file && !UTIL_isRegularFile(resource->path)) { 391 fprintf(stderr, "output file '%s' does not exist\n", resource->path); 392 return EIO; 393 } 394 if (type == data_type_dir && !UTIL_isDirectory(resource->path)) { 395 fprintf( 396 stderr, "output directory '%s' does not exist\n", resource->path); 397 return EIO; 398 } 399 /* Check that the hash matches. */ 400 if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) { 401 fprintf( 402 stderr, 403 "checksum does not match: 0x%llxLL != 0x%llxLL\n", 404 (unsigned long long)XXH64_digest(&cdata.xxhash64), 405 (unsigned long long)resource->xxhash64); 406 return EINVAL; 407 } 408 409 return 0; 410 } 411 412 /** Download a single data object. */ 413 static int curl_download_datum(CURL* curl, data_t const* data) { 414 int ret; 415 ret = curl_download_resource(curl, &data->data, data->type); 416 if (ret != 0) 417 return ret; 418 if (data_has_dict(data)) { 419 ret = curl_download_resource(curl, &data->dict, data_type_file); 420 if (ret != 0) 421 return ret; 422 } 423 return ret; 424 } 425 426 /** Download all the data. */ 427 static int curl_download_data(data_t const* const* data) { 428 if (curl_global_init(CURL_GLOBAL_ALL) != 0) 429 return EFAULT; 430 431 curl_data_t cdata = {}; 432 CURL* curl = curl_easy_init(); 433 int err = EFAULT; 434 435 if (curl == NULL) 436 return EFAULT; 437 438 if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0) 439 goto out; 440 if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0) 441 goto out; 442 if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0) 443 goto out; 444 445 assert(data != NULL); 446 for (; *data != NULL; ++data) { 447 if (curl_download_datum(curl, *data) != 0) 448 goto out; 449 } 450 451 err = 0; 452 out: 453 curl_easy_cleanup(curl); 454 curl_global_cleanup(); 455 return err; 456 } 457 458 /** Fill the path member variable of the data objects. */ 459 static int data_create_paths(data_t* const* data, char const* dir) { 460 size_t const dirlen = strlen(dir); 461 assert(data != NULL); 462 for (; *data != NULL; ++data) { 463 data_t* const datum = *data; 464 datum->data.path = cat3(dir, "/", datum->name); 465 if (datum->data.path == NULL) 466 return ENOMEM; 467 if (data_has_dict(datum)) { 468 datum->dict.path = cat2(datum->data.path, ".dict"); 469 if (datum->dict.path == NULL) 470 return ENOMEM; 471 } 472 } 473 return 0; 474 } 475 476 /** Free the path member variable of the data objects. */ 477 static void data_free_paths(data_t* const* data) { 478 assert(data != NULL); 479 for (; *data != NULL; ++data) { 480 data_t* datum = *data; 481 free((void*)datum->data.path); 482 free((void*)datum->dict.path); 483 datum->data.path = NULL; 484 datum->dict.path = NULL; 485 } 486 } 487 488 static char const kStampName[] = "STAMP"; 489 490 static void xxh_update_le(XXH64_state_t* state, uint64_t data) { 491 if (!MEM_isLittleEndian()) 492 data = MEM_swap64(data); 493 XXH64_update(state, &data, sizeof(data)); 494 } 495 496 /** Hash the data to create the stamp. */ 497 static uint64_t stamp_hash(data_t const* const* data) { 498 XXH64_state_t state; 499 500 XXH64_reset(&state, 0); 501 assert(data != NULL); 502 for (; *data != NULL; ++data) { 503 data_t const* datum = *data; 504 /* We don't care about the URL that we fetch from. */ 505 /* The path is derived from the name. */ 506 XXH64_update(&state, datum->name, strlen(datum->name)); 507 xxh_update_le(&state, datum->data.xxhash64); 508 xxh_update_le(&state, datum->dict.xxhash64); 509 xxh_update_le(&state, datum->type); 510 } 511 return XXH64_digest(&state); 512 } 513 514 /** Check if the stamp matches the stamp in the cache directory. */ 515 static int stamp_check(char const* dir, data_t const* const* data) { 516 char* stamp = cat3(dir, "/", kStampName); 517 uint64_t const expected = stamp_hash(data); 518 XXH64_canonical_t actual; 519 FILE* stampfile = NULL; 520 int matches = 0; 521 522 if (stamp == NULL) 523 goto out; 524 if (!UTIL_isRegularFile(stamp)) { 525 fprintf(stderr, "stamp does not exist: recreating the data cache\n"); 526 goto out; 527 } 528 529 stampfile = fopen(stamp, "rb"); 530 if (stampfile == NULL) { 531 fprintf(stderr, "could not open stamp: recreating the data cache\n"); 532 goto out; 533 } 534 535 size_t b; 536 if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) { 537 fprintf(stderr, "invalid stamp: recreating the data cache\n"); 538 goto out; 539 } 540 541 matches = (expected == XXH64_hashFromCanonical(&actual)); 542 if (matches) 543 fprintf(stderr, "stamp matches: reusing the cached data\n"); 544 else 545 fprintf(stderr, "stamp does not match: recreating the data cache\n"); 546 547 out: 548 free(stamp); 549 if (stampfile != NULL) 550 fclose(stampfile); 551 return matches; 552 } 553 554 /** On success write a new stamp, on failure delete the old stamp. */ 555 static int 556 stamp_write(char const* dir, data_t const* const* data, int const data_err) { 557 char* stamp = cat3(dir, "/", kStampName); 558 FILE* stampfile = NULL; 559 int err = EIO; 560 561 if (stamp == NULL) 562 return ENOMEM; 563 564 if (data_err != 0) { 565 err = data_err; 566 goto out; 567 } 568 XXH64_canonical_t hash; 569 570 XXH64_canonicalFromHash(&hash, stamp_hash(data)); 571 572 stampfile = fopen(stamp, "wb"); 573 if (stampfile == NULL) 574 goto out; 575 if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1) 576 goto out; 577 err = 0; 578 fprintf(stderr, "stamped new data cache\n"); 579 out: 580 if (err != 0) 581 /* Ignore errors. */ 582 unlink(stamp); 583 free(stamp); 584 if (stampfile != NULL) 585 fclose(stampfile); 586 return err; 587 } 588 589 int data_init(char const* dir) { 590 int err; 591 592 if (dir == NULL) 593 return EINVAL; 594 595 /* This must be first to simplify logic. */ 596 err = ensure_directory_exists(dir); 597 if (err != 0) 598 return err; 599 600 /* Save the cache directory. */ 601 g_data_dir = strdup(dir); 602 if (g_data_dir == NULL) 603 return ENOMEM; 604 605 err = data_create_paths(g_data, dir); 606 if (err != 0) 607 return err; 608 609 /* If the stamp matches then we are good to go. 610 * This must be called before any modifications to the data cache. 611 * After this point, we MUST call stamp_write() to update the STAMP, 612 * since we've updated the data cache. 613 */ 614 if (stamp_check(dir, data)) 615 return 0; 616 617 err = curl_download_data(data); 618 if (err != 0) 619 goto out; 620 621 out: 622 /* This must be last, since it must know if data_init() succeeded. */ 623 stamp_write(dir, data, err); 624 return err; 625 } 626 627 void data_finish(void) { 628 data_free_paths(g_data); 629 free(g_data_dir); 630 g_data_dir = NULL; 631 } 632