1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * Portions Copyright 2008 Denis Cheng 26 */ 27 28 #include "config.h" 29 #include "filebench.h" 30 #include "flowop.h" 31 #include "threadflow.h" /* For aiolist definition */ 32 33 #ifndef HAVE_OFF64_T 34 /* 35 * We are probably on linux. 36 * According to http://www.suse.de/~aj/linux_lfs.html, defining the 37 * above, automatically changes type of off_t to off64_t. so let 38 * us use only off_t as off64_t is not defined 39 */ 40 #defineoff64_t off_t 41 #endif /* HAVE_OFF64_T */ 42 43 #include <fcntl.h> 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <unistd.h> 47 #include <libgen.h> 48 #include <sys/mman.h> 49 #include <sys/stat.h> 50 #include <sys/types.h> 51 #include <sys/param.h> 52 #include <sys/resource.h> 53 54 #include "filebench.h" 55 #include "fsplug.h" 56 57 #ifdef HAVE_AIO 58 #include <aio.h> 59 #endif /* HAVE_AIO */ 60 61 #ifdef HAVE_LIBAIO_H 62 #include <libaio.h> 63 #endif /* HAVE_LIBAIO_H */ 64 65 #ifndef HAVE_AIOCB64_T 66 #define aiocb64 aiocb 67 #endif /* HAVE_AIOCB64_T */ 68 69 /* 70 * These routines implement local file access. They are placed into a 71 * vector of functions that are called by all I/O operations in fileset.c 72 * and flowop_library.c. This represents the default file system plug-in, 73 * and may be replaced by vectors for other file system plug-ins. 74 */ 75 76 static int fb_lfs_freemem(fb_fdesc_t *fd, off64_t size); 77 static int fb_lfs_open(fb_fdesc_t *, char *, int, int); 78 static int fb_lfs_pread(fb_fdesc_t *, caddr_t, fbint_t, off64_t); 79 static int fb_lfs_read(fb_fdesc_t *, caddr_t, fbint_t); 80 static int fb_lfs_pwrite(fb_fdesc_t *, caddr_t, fbint_t, off64_t); 81 static int fb_lfs_write(fb_fdesc_t *, caddr_t, fbint_t); 82 static int fb_lfs_lseek(fb_fdesc_t *, off64_t, int); 83 static int fb_lfs_truncate(fb_fdesc_t *, off64_t); 84 static int fb_lfs_rename(const char *, const char *); 85 static int fb_lfs_close(fb_fdesc_t *); 86 static int fb_lfs_link(const char *, const char *); 87 static int fb_lfs_symlink(const char *, const char *); 88 static int fb_lfs_unlink(char *); 89 static ssize_t fb_lfs_readlink(const char *, char *, size_t); 90 static int fb_lfs_mkdir(char *, int); 91 static int fb_lfs_rmdir(char *); 92 static DIR *fb_lfs_opendir(char *); 93 static struct dirent *fb_lfs_readdir(DIR *); 94 static int fb_lfs_closedir(DIR *); 95 static int fb_lfs_fsync(fb_fdesc_t *); 96 static int fb_lfs_stat(char *, struct stat64 *); 97 static int fb_lfs_fstat(fb_fdesc_t *, struct stat64 *); 98 static int fb_lfs_access(const char *, int); 99 100 static fsplug_func_t fb_lfs_funcs = 101 { 102 "locfs", 103 fb_lfs_freemem, /* flush page cache */ 104 fb_lfs_open, /* open */ 105 fb_lfs_pread, /* pread */ 106 fb_lfs_read, /* read */ 107 fb_lfs_pwrite, /* pwrite */ 108 fb_lfs_write, /* write */ 109 fb_lfs_lseek, /* lseek */ 110 fb_lfs_truncate, /* ftruncate */ 111 fb_lfs_rename, /* rename */ 112 fb_lfs_close, /* close */ 113 fb_lfs_link, /* link */ 114 fb_lfs_symlink, /* symlink */ 115 fb_lfs_unlink, /* unlink */ 116 fb_lfs_readlink, /* readlink */ 117 fb_lfs_mkdir, /* mkdir */ 118 fb_lfs_rmdir, /* rmdir */ 119 fb_lfs_opendir, /* opendir */ 120 fb_lfs_readdir, /* readdir */ 121 fb_lfs_closedir, /* closedir */ 122 fb_lfs_fsync, /* fsync */ 123 fb_lfs_stat, /* stat */ 124 fb_lfs_fstat, /* fstat */ 125 fb_lfs_access /* access */ 126 }; 127 128 #ifdef HAVE_AIO 129 /* 130 * Local file system asynchronous IO flowops are in this module, as 131 * they have a number of local file system specific features. 132 */ 133 static int fb_lfsflow_aiowrite(threadflow_t *threadflow, flowop_t *flowop); 134 static int fb_lfsflow_aiowait(threadflow_t *threadflow, flowop_t *flowop); 135 136 static flowop_proto_t fb_lfsflow_funcs[] = { 137 FLOW_TYPE_AIO, FLOW_ATTR_WRITE, "aiowrite", flowop_init_generic, 138 fb_lfsflow_aiowrite, flowop_destruct_generic, 139 FLOW_TYPE_AIO, 0, "aiowait", flowop_init_generic, 140 fb_lfsflow_aiowait, flowop_destruct_generic 141 }; 142 143 #endif /* HAVE_AIO */ 144 145 /* 146 * Initialize this processes I/O functions vector to point to 147 * the vector of local file system I/O functions 148 */ 149 void 150 fb_lfs_funcvecinit(void) 151 { 152 fs_functions_vec = &fb_lfs_funcs; 153 } 154 155 /* 156 * Initialize those flowops whose implementation is file system 157 * specific. 158 */ 159 void 160 fb_lfs_flowinit(void) 161 { 162 int nops; 163 164 /* 165 * re-initialize the I/O functions vector while we are at 166 * it as it may have been redefined since the process was 167 * created, at least if this is the master processes 168 */ 169 fb_lfs_funcvecinit(); 170 171 #ifdef HAVE_AIO 172 nops = sizeof (fb_lfsflow_funcs) / sizeof (flowop_proto_t); 173 flowop_flow_init(fb_lfsflow_funcs, nops); 174 #endif /* HAVE_AIO */ 175 } 176 177 /* 178 * Frees up memory mapped file region of supplied size. The 179 * file descriptor "fd" indicates which memory mapped file. 180 * If successful, returns 0. Otherwise returns -1 if "size" 181 * is zero, or -1 times the number of times msync() failed. 182 */ 183 static int 184 fb_lfs_freemem(fb_fdesc_t *fd, off64_t size) 185 { 186 off64_t left; 187 int ret = 0; 188 189 for (left = size; left > 0; left -= MMAP_SIZE) { 190 off64_t thismapsize; 191 caddr_t addr; 192 193 thismapsize = MIN(MMAP_SIZE, left); 194 addr = mmap64(0, thismapsize, PROT_READ|PROT_WRITE, 195 MAP_SHARED, fd->fd_num, size - left); 196 ret += msync(addr, thismapsize, MS_INVALIDATE); 197 (void) munmap(addr, thismapsize); 198 } 199 return (ret); 200 } 201 202 /* 203 * Does a posix pread. Returns what the pread() returns. 204 */ 205 static int 206 fb_lfs_pread(fb_fdesc_t *fd, caddr_t iobuf, fbint_t iosize, off64_t fileoffset) 207 { 208 return (pread64(fd->fd_num, iobuf, iosize, fileoffset)); 209 } 210 211 /* 212 * Does a posix read. Returns what the read() returns. 213 */ 214 static int 215 fb_lfs_read(fb_fdesc_t *fd, caddr_t iobuf, fbint_t iosize) 216 { 217 return (read(fd->fd_num, iobuf, iosize)); 218 } 219 220 #ifdef HAVE_AIO 221 222 /* 223 * Asynchronous write section. An Asynchronous IO element 224 * (aiolist_t) is used to associate the asynchronous write request with 225 * its subsequent completion. This element includes a aiocb64 struct 226 * that is used by posix aio_xxx calls to track the asynchronous writes. 227 * The flowops aiowrite and aiowait result in calls to these posix 228 * aio_xxx system routines to do the actual asynchronous write IO 229 * operations. 230 */ 231 232 233 /* 234 * Allocates an asynchronous I/O list (aio, of type 235 * aiolist_t) element. Adds it to the flowop thread's 236 * threadflow aio list. Returns a pointer to the element. 237 */ 238 static aiolist_t * 239 aio_allocate(flowop_t *flowop) 240 { 241 aiolist_t *aiolist; 242 243 if ((aiolist = malloc(sizeof (aiolist_t))) == NULL) { 244 filebench_log(LOG_ERROR, "malloc aiolist failed"); 245 filebench_shutdown(1); 246 } 247 248 /* Add to list */ 249 if (flowop->fo_thread->tf_aiolist == NULL) { 250 flowop->fo_thread->tf_aiolist = aiolist; 251 aiolist->al_next = NULL; 252 } else { 253 aiolist->al_next = flowop->fo_thread->tf_aiolist; 254 flowop->fo_thread->tf_aiolist = aiolist; 255 } 256 return (aiolist); 257 } 258 259 /* 260 * Searches for the aiolist element that has a matching 261 * completion block, aiocb. If none found returns FILEBENCH_ERROR. If 262 * found, removes the aiolist element from flowop thread's 263 * list and returns FILEBENCH_OK. 264 */ 265 static int 266 aio_deallocate(flowop_t *flowop, struct aiocb64 *aiocb) 267 { 268 aiolist_t *aiolist = flowop->fo_thread->tf_aiolist; 269 aiolist_t *previous = NULL; 270 aiolist_t *match = NULL; 271 272 if (aiocb == NULL) { 273 filebench_log(LOG_ERROR, "null aiocb deallocate"); 274 return (FILEBENCH_OK); 275 } 276 277 while (aiolist) { 278 if (aiocb == &(aiolist->al_aiocb)) { 279 match = aiolist; 280 break; 281 } 282 previous = aiolist; 283 aiolist = aiolist->al_next; 284 } 285 286 if (match == NULL) 287 return (FILEBENCH_ERROR); 288 289 /* Remove from the list */ 290 if (previous) 291 previous->al_next = match->al_next; 292 else 293 flowop->fo_thread->tf_aiolist = match->al_next; 294 295 return (FILEBENCH_OK); 296 } 297 298 /* 299 * Emulate posix aiowrite(). Determines which file to use, 300 * either one file of a fileset, or the file associated 301 * with a fileobj, allocates and fills an aiolist_t element 302 * for the write, and issues the asynchronous write. This 303 * operation is only valid for random IO, and returns an 304 * error if the flowop is set for sequential IO. Returns 305 * FILEBENCH_OK on success, FILEBENCH_NORSC if iosetup can't 306 * obtain a file to open, and FILEBENCH_ERROR on any 307 * encountered error. 308 */ 309 static int 310 fb_lfsflow_aiowrite(threadflow_t *threadflow, flowop_t *flowop) 311 { 312 caddr_t iobuf; 313 fbint_t wss; 314 fbint_t iosize; 315 fb_fdesc_t *fdesc; 316 int ret; 317 318 iosize = avd_get_int(flowop->fo_iosize); 319 320 if ((ret = flowoplib_iosetup(threadflow, flowop, &wss, &iobuf, 321 &fdesc, iosize)) != FILEBENCH_OK) 322 return (ret); 323 324 if (avd_get_bool(flowop->fo_random)) { 325 uint64_t fileoffset; 326 struct aiocb64 *aiocb; 327 aiolist_t *aiolist; 328 329 if (filebench_randomno64(&fileoffset, 330 wss, iosize, NULL) == -1) { 331 filebench_log(LOG_ERROR, 332 "file size smaller than IO size for thread %s", 333 flowop->fo_name); 334 return (FILEBENCH_ERROR); 335 } 336 337 aiolist = aio_allocate(flowop); 338 aiolist->al_type = AL_WRITE; 339 aiocb = &aiolist->al_aiocb; 340 341 aiocb->aio_fildes = fdesc->fd_num; 342 aiocb->aio_buf = iobuf; 343 aiocb->aio_nbytes = (size_t)iosize; 344 aiocb->aio_offset = (off64_t)fileoffset; 345 aiocb->aio_reqprio = 0; 346 347 filebench_log(LOG_DEBUG_IMPL, 348 "aio fd=%d, bytes=%llu, offset=%llu", 349 fdesc->fd_num, (u_longlong_t)iosize, 350 (u_longlong_t)fileoffset); 351 352 flowop_beginop(threadflow, flowop); 353 if (aio_write64(aiocb) < 0) { 354 filebench_log(LOG_ERROR, "aiowrite failed: %s", 355 strerror(errno)); 356 filebench_shutdown(1); 357 } 358 flowop_endop(threadflow, flowop, iosize); 359 } else { 360 return (FILEBENCH_ERROR); 361 } 362 363 return (FILEBENCH_OK); 364 } 365 366 367 368 #define MAXREAP 4096 369 370 /* 371 * Emulate posix aiowait(). Waits for the completion of half the 372 * outstanding asynchronous IOs, or a single IO, which ever is 373 * larger. The routine will return after a sufficient number of 374 * completed calls issued by any thread in the procflow have 375 * completed, or a 1 second timout elapses. All completed 376 * IO operations are deleted from the thread's aiolist. 377 */ 378 static int 379 fb_lfsflow_aiowait(threadflow_t *threadflow, flowop_t *flowop) 380 { 381 struct aiocb64 **worklist; 382 aiolist_t *aio = flowop->fo_thread->tf_aiolist; 383 int uncompleted = 0; 384 385 worklist = calloc(MAXREAP, sizeof (struct aiocb64 *)); 386 387 /* Count the list of pending aios */ 388 while (aio) { 389 uncompleted++; 390 aio = aio->al_next; 391 } 392 393 do { 394 uint_t ncompleted = 0; 395 uint_t todo; 396 struct timespec timeout; 397 int inprogress; 398 int i; 399 400 /* Wait for half of the outstanding requests */ 401 timeout.tv_sec = 1; 402 timeout.tv_nsec = 0; 403 404 if (uncompleted > MAXREAP) 405 todo = MAXREAP; 406 else 407 todo = uncompleted / 2; 408 409 if (todo == 0) 410 todo = 1; 411 412 flowop_beginop(threadflow, flowop); 413 414 #if (defined(HAVE_AIOWAITN) && defined(USE_PROCESS_MODEL)) 415 if (((aio_waitn64((struct aiocb64 **)worklist, 416 MAXREAP, &todo, &timeout)) == -1) && 417 errno && (errno != ETIME)) { 418 filebench_log(LOG_ERROR, 419 "aiowait failed: %s, outstanding = %d, " 420 "ncompleted = %d ", 421 strerror(errno), uncompleted, todo); 422 } 423 424 ncompleted = todo; 425 /* Take the completed I/Os from the list */ 426 inprogress = 0; 427 for (i = 0; i < ncompleted; i++) { 428 if ((aio_return64(worklist[i]) == -1) && 429 (errno == EINPROGRESS)) { 430 inprogress++; 431 continue; 432 } 433 if (aio_deallocate(flowop, worklist[i]) 434 == FILEBENCH_ERROR) { 435 filebench_log(LOG_ERROR, "Could not remove " 436 "aio from list "); 437 flowop_endop(threadflow, flowop, 0); 438 return (FILEBENCH_ERROR); 439 } 440 } 441 442 uncompleted -= ncompleted; 443 uncompleted += inprogress; 444 445 #else 446 447 for (ncompleted = 0, inprogress = 0, 448 aio = flowop->fo_thread->tf_aiolist; 449 ncompleted < todo, aio != NULL; aio = aio->al_next) { 450 int result = aio_error64(&aio->al_aiocb); 451 452 if (result == EINPROGRESS) { 453 inprogress++; 454 continue; 455 } 456 457 if ((aio_return64(&aio->al_aiocb) == -1) || result) { 458 filebench_log(LOG_ERROR, "aio failed: %s", 459 strerror(result)); 460 continue; 461 } 462 463 ncompleted++; 464 465 if (aio_deallocate(flowop, &aio->al_aiocb) < 0) { 466 filebench_log(LOG_ERROR, "Could not remove " 467 "aio from list "); 468 flowop_endop(threadflow, flowop, 0); 469 return (FILEBENCH_ERROR); 470 } 471 } 472 473 uncompleted -= ncompleted; 474 475 #endif 476 filebench_log(LOG_DEBUG_SCRIPT, 477 "aio2 completed %d ios, uncompleted = %d, inprogress = %d", 478 ncompleted, uncompleted, inprogress); 479 480 } while (uncompleted > MAXREAP); 481 482 flowop_endop(threadflow, flowop, 0); 483 484 free(worklist); 485 486 return (FILEBENCH_OK); 487 } 488 489 #endif /* HAVE_AIO */ 490 491 /* 492 * Does an open64 of a file. Inserts the file descriptor number returned 493 * by open() into the supplied filebench fd. Returns FILEBENCH_OK on 494 * successs, and FILEBENCH_ERROR on failure. 495 */ 496 497 static int 498 fb_lfs_open(fb_fdesc_t *fd, char *path, int flags, int perms) 499 { 500 if ((fd->fd_num = open64(path, flags, perms)) < 0) 501 return (FILEBENCH_ERROR); 502 else 503 return (FILEBENCH_OK); 504 } 505 506 /* 507 * Does an unlink (delete) of a file. 508 */ 509 static int 510 fb_lfs_unlink(char *path) 511 { 512 return (unlink(path)); 513 } 514 515 /* 516 * Does a readlink of a symbolic link. 517 */ 518 static ssize_t 519 fb_lfs_readlink(const char *path, char *buf, size_t buf_size) 520 { 521 return (readlink(path, buf, buf_size)); 522 } 523 524 /* 525 * Does fsync of a file. Returns with fsync return info. 526 */ 527 static int 528 fb_lfs_fsync(fb_fdesc_t *fd) 529 { 530 return (fsync(fd->fd_num)); 531 } 532 533 /* 534 * Do a posix lseek of a file. Return what lseek() returns. 535 */ 536 static int 537 fb_lfs_lseek(fb_fdesc_t *fd, off64_t offset, int whence) 538 { 539 return (lseek64(fd->fd_num, offset, whence)); 540 } 541 542 /* 543 * Do a posix rename of a file. Return what rename() returns. 544 */ 545 static int 546 fb_lfs_rename(const char *old, const char *new) 547 { 548 return (rename(old, new)); 549 } 550 551 552 /* 553 * Do a posix close of a file. Return what close() returns. 554 */ 555 static int 556 fb_lfs_close(fb_fdesc_t *fd) 557 { 558 return (close(fd->fd_num)); 559 } 560 561 /* 562 * Use mkdir to create a directory. 563 */ 564 static int 565 fb_lfs_mkdir(char *path, int perm) 566 { 567 return (mkdir(path, perm)); 568 } 569 570 /* 571 * Use rmdir to delete a directory. Returns what rmdir() returns. 572 */ 573 static int 574 fb_lfs_rmdir(char *path) 575 { 576 return (rmdir(path)); 577 } 578 579 /* 580 * Does a posix opendir(), Returns a directory handle on success, 581 * NULL on failure. 582 */ 583 static DIR * 584 fb_lfs_opendir(char *path) 585 { 586 return (opendir(path)); 587 } 588 589 /* 590 * Does a readdir() call. Returns a pointer to a table of directory 591 * information on success, NULL on failure. 592 */ 593 static struct dirent * 594 fb_lfs_readdir(DIR *dirp) 595 { 596 return (readdir(dirp)); 597 } 598 599 /* 600 * Does a closedir() call. 601 */ 602 static int 603 fb_lfs_closedir(DIR *dirp) 604 { 605 return (closedir(dirp)); 606 } 607 608 /* 609 * Does an fstat of a file. 610 */ 611 static int 612 fb_lfs_fstat(fb_fdesc_t *fd, struct stat64 *statbufp) 613 { 614 return (fstat64(fd->fd_num, statbufp)); 615 } 616 617 /* 618 * Does a stat of a file. 619 */ 620 static int 621 fb_lfs_stat(char *path, struct stat64 *statbufp) 622 { 623 return (stat64(path, statbufp)); 624 } 625 626 /* 627 * Do a pwrite64 to a file. 628 */ 629 static int 630 fb_lfs_pwrite(fb_fdesc_t *fd, caddr_t iobuf, fbint_t iosize, off64_t offset) 631 { 632 return (pwrite64(fd->fd_num, iobuf, iosize, offset)); 633 } 634 635 /* 636 * Do a write to a file. 637 */ 638 static int 639 fb_lfs_write(fb_fdesc_t *fd, caddr_t iobuf, fbint_t iosize) 640 { 641 return (write(fd->fd_num, iobuf, iosize)); 642 } 643 644 /* 645 * Does a truncate operation and returns the result 646 */ 647 static int 648 fb_lfs_truncate(fb_fdesc_t *fd, off64_t fse_size) 649 { 650 #ifdef HAVE_FTRUNCATE64 651 return (ftruncate64(fd->fd_num, fse_size)); 652 #else 653 return (ftruncate(fd->fd_num, (off_t)fse_size)); 654 #endif 655 } 656 657 /* 658 * Does a link operation and returns the result 659 */ 660 static int 661 fb_lfs_link(const char *existing, const char *new) 662 { 663 return (link(existing, new)); 664 } 665 666 /* 667 * Does a symlink operation and returns the result 668 */ 669 static int 670 fb_lfs_symlink(const char *existing, const char *new) 671 { 672 return (symlink(existing, new)); 673 } 674 675 /* 676 * Does an access() check on a file. 677 */ 678 static int 679 fb_lfs_access(const char *path, int amode) 680 { 681 return (access(path, amode)); 682 } 683