1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 *
25 * Portions Copyright 2008 Denis Cheng
26 */
27
28 #include "config.h"
29 #include "filebench.h"
30 #include "flowop.h"
31 #include "threadflow.h" /* For aiolist definition */
32
33 #ifndef HAVE_OFF64_T
34 /*
35 * We are probably on linux.
36 * According to http://www.suse.de/~aj/linux_lfs.html, defining the
37 * above, automatically changes type of off_t to off64_t. so let
38 * us use only off_t as off64_t is not defined
39 */
40 #define off64_t off_t
41 #endif /* HAVE_OFF64_T */
42
43 #include <fcntl.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <unistd.h>
47 #include <libgen.h>
48 #include <sys/mman.h>
49 #include <sys/stat.h>
50 #include <sys/types.h>
51 #include <sys/param.h>
52 #include <sys/resource.h>
53
54 #include "filebench.h"
55 #include "fsplug.h"
56
57 #ifdef HAVE_AIO
58 #include <aio.h>
59 #endif /* HAVE_AIO */
60
61 #ifdef HAVE_LIBAIO_H
62 #include <libaio.h>
63 #endif /* HAVE_LIBAIO_H */
64
65 #ifndef HAVE_AIOCB64_T
66 #define aiocb64 aiocb
67 #endif /* HAVE_AIOCB64_T */
68
69 /*
70 * These routines implement local file access. They are placed into a
71 * vector of functions that are called by all I/O operations in fileset.c
72 * and flowop_library.c. This represents the default file system plug-in,
73 * and may be replaced by vectors for other file system plug-ins.
74 */
75
76 static int fb_lfs_freemem(fb_fdesc_t *fd, off64_t size);
77 static int fb_lfs_open(fb_fdesc_t *, char *, int, int);
78 static int fb_lfs_pread(fb_fdesc_t *, caddr_t, fbint_t, off64_t);
79 static int fb_lfs_read(fb_fdesc_t *, caddr_t, fbint_t);
80 static int fb_lfs_pwrite(fb_fdesc_t *, caddr_t, fbint_t, off64_t);
81 static int fb_lfs_write(fb_fdesc_t *, caddr_t, fbint_t);
82 static int fb_lfs_lseek(fb_fdesc_t *, off64_t, int);
83 static int fb_lfs_truncate(fb_fdesc_t *, off64_t);
84 static int fb_lfs_rename(const char *, const char *);
85 static int fb_lfs_close(fb_fdesc_t *);
86 static int fb_lfs_link(const char *, const char *);
87 static int fb_lfs_symlink(const char *, const char *);
88 static int fb_lfs_unlink(char *);
89 static ssize_t fb_lfs_readlink(const char *, char *, size_t);
90 static int fb_lfs_mkdir(char *, int);
91 static int fb_lfs_rmdir(char *);
92 static DIR *fb_lfs_opendir(char *);
93 static struct dirent *fb_lfs_readdir(DIR *);
94 static int fb_lfs_closedir(DIR *);
95 static int fb_lfs_fsync(fb_fdesc_t *);
96 static int fb_lfs_stat(char *, struct stat64 *);
97 static int fb_lfs_fstat(fb_fdesc_t *, struct stat64 *);
98 static int fb_lfs_access(const char *, int);
99 static void fb_lfs_recur_rm(char *);
100
101 static fsplug_func_t fb_lfs_funcs =
102 {
103 "locfs",
104 fb_lfs_freemem, /* flush page cache */
105 fb_lfs_open, /* open */
106 fb_lfs_pread, /* pread */
107 fb_lfs_read, /* read */
108 fb_lfs_pwrite, /* pwrite */
109 fb_lfs_write, /* write */
110 fb_lfs_lseek, /* lseek */
111 fb_lfs_truncate, /* ftruncate */
112 fb_lfs_rename, /* rename */
113 fb_lfs_close, /* close */
114 fb_lfs_link, /* link */
115 fb_lfs_symlink, /* symlink */
116 fb_lfs_unlink, /* unlink */
117 fb_lfs_readlink, /* readlink */
118 fb_lfs_mkdir, /* mkdir */
119 fb_lfs_rmdir, /* rmdir */
120 fb_lfs_opendir, /* opendir */
121 fb_lfs_readdir, /* readdir */
122 fb_lfs_closedir, /* closedir */
123 fb_lfs_fsync, /* fsync */
124 fb_lfs_stat, /* stat */
125 fb_lfs_fstat, /* fstat */
126 fb_lfs_access, /* access */
127 fb_lfs_recur_rm /* recursive rm */
128 };
129
130 #ifdef HAVE_AIO
131 /*
132 * Local file system asynchronous IO flowops are in this module, as
133 * they have a number of local file system specific features.
134 */
135 static int fb_lfsflow_aiowrite(threadflow_t *threadflow, flowop_t *flowop);
136 static int fb_lfsflow_aiowait(threadflow_t *threadflow, flowop_t *flowop);
137
138 static flowop_proto_t fb_lfsflow_funcs[] = {
139 FLOW_TYPE_AIO, FLOW_ATTR_WRITE, "aiowrite", flowop_init_generic,
140 fb_lfsflow_aiowrite, flowop_destruct_generic,
141 FLOW_TYPE_AIO, 0, "aiowait", flowop_init_generic,
142 fb_lfsflow_aiowait, flowop_destruct_generic
143 };
144
145 #endif /* HAVE_AIO */
146
147 /*
148 * Initialize this processes I/O functions vector to point to
149 * the vector of local file system I/O functions
150 */
151 void
fb_lfs_funcvecinit(void)152 fb_lfs_funcvecinit(void)
153 {
154 fs_functions_vec = &fb_lfs_funcs;
155 }
156
/*
 * Initialize those flowops whose implementation is file system
 * specific. When asynchronous IO is available (HAVE_AIO), the
 * aiowrite and aiowait flowops are registered through
 * flowop_flow_init().
 */
void
fb_lfs_flowinit(void)
{
	/*
	 * Re-initialize the I/O functions vector while we are at
	 * it, as it may have been redefined since the process was
	 * created, at least if this is the master process.
	 */
	fb_lfs_funcvecinit();

#ifdef HAVE_AIO
	/*
	 * The entry count is derived from the table itself and is only
	 * computed when the table exists, so builds without HAVE_AIO
	 * carry no unused local.
	 */
	flowop_flow_init(fb_lfsflow_funcs,
	    (int)(sizeof (fb_lfsflow_funcs) / sizeof (fb_lfsflow_funcs[0])));
#endif /* HAVE_AIO */
}
178
179 /*
180 * Frees up memory mapped file region of supplied size. The
181 * file descriptor "fd" indicates which memory mapped file.
182 * If successful, returns 0. Otherwise returns -1 if "size"
183 * is zero, or -1 times the number of times msync() failed.
184 */
185 static int
fb_lfs_freemem(fb_fdesc_t * fd,off64_t size)186 fb_lfs_freemem(fb_fdesc_t *fd, off64_t size)
187 {
188 off64_t left;
189 int ret = 0;
190
191 for (left = size; left > 0; left -= MMAP_SIZE) {
192 off64_t thismapsize;
193 caddr_t addr;
194
195 thismapsize = MIN(MMAP_SIZE, left);
196 addr = mmap64(0, thismapsize, PROT_READ|PROT_WRITE,
197 MAP_SHARED, fd->fd_num, size - left);
198 ret += msync(addr, thismapsize, MS_INVALIDATE);
199 (void) munmap(addr, thismapsize);
200 }
201 return (ret);
202 }
203
204 /*
205 * Does a posix pread. Returns what the pread() returns.
206 */
207 static int
fb_lfs_pread(fb_fdesc_t * fd,caddr_t iobuf,fbint_t iosize,off64_t fileoffset)208 fb_lfs_pread(fb_fdesc_t *fd, caddr_t iobuf, fbint_t iosize, off64_t fileoffset)
209 {
210 return (pread64(fd->fd_num, iobuf, iosize, fileoffset));
211 }
212
213 /*
214 * Does a posix read. Returns what the read() returns.
215 */
216 static int
fb_lfs_read(fb_fdesc_t * fd,caddr_t iobuf,fbint_t iosize)217 fb_lfs_read(fb_fdesc_t *fd, caddr_t iobuf, fbint_t iosize)
218 {
219 return (read(fd->fd_num, iobuf, iosize));
220 }
221
222 #ifdef HAVE_AIO
223
224 /*
225 * Asynchronous write section. An Asynchronous IO element
226 * (aiolist_t) is used to associate the asynchronous write request with
227 * its subsequent completion. This element includes a aiocb64 struct
228 * that is used by posix aio_xxx calls to track the asynchronous writes.
229 * The flowops aiowrite and aiowait result in calls to these posix
230 * aio_xxx system routines to do the actual asynchronous write IO
231 * operations.
232 */
233
234
235 /*
236 * Allocates an asynchronous I/O list (aio, of type
237 * aiolist_t) element. Adds it to the flowop thread's
238 * threadflow aio list. Returns a pointer to the element.
239 */
240 static aiolist_t *
aio_allocate(flowop_t * flowop)241 aio_allocate(flowop_t *flowop)
242 {
243 aiolist_t *aiolist;
244
245 if ((aiolist = malloc(sizeof (aiolist_t))) == NULL) {
246 filebench_log(LOG_ERROR, "malloc aiolist failed");
247 filebench_shutdown(1);
248 }
249
250 /* Add to list */
251 if (flowop->fo_thread->tf_aiolist == NULL) {
252 flowop->fo_thread->tf_aiolist = aiolist;
253 aiolist->al_next = NULL;
254 } else {
255 aiolist->al_next = flowop->fo_thread->tf_aiolist;
256 flowop->fo_thread->tf_aiolist = aiolist;
257 }
258 return (aiolist);
259 }
260
261 /*
262 * Searches for the aiolist element that has a matching
263 * completion block, aiocb. If none found returns FILEBENCH_ERROR. If
264 * found, removes the aiolist element from flowop thread's
265 * list and returns FILEBENCH_OK.
266 */
267 static int
aio_deallocate(flowop_t * flowop,struct aiocb64 * aiocb)268 aio_deallocate(flowop_t *flowop, struct aiocb64 *aiocb)
269 {
270 aiolist_t *aiolist = flowop->fo_thread->tf_aiolist;
271 aiolist_t *previous = NULL;
272 aiolist_t *match = NULL;
273
274 if (aiocb == NULL) {
275 filebench_log(LOG_ERROR, "null aiocb deallocate");
276 return (FILEBENCH_OK);
277 }
278
279 while (aiolist) {
280 if (aiocb == &(aiolist->al_aiocb)) {
281 match = aiolist;
282 break;
283 }
284 previous = aiolist;
285 aiolist = aiolist->al_next;
286 }
287
288 if (match == NULL)
289 return (FILEBENCH_ERROR);
290
291 /* Remove from the list */
292 if (previous)
293 previous->al_next = match->al_next;
294 else
295 flowop->fo_thread->tf_aiolist = match->al_next;
296
297 return (FILEBENCH_OK);
298 }
299
300 /*
301 * Emulate posix aiowrite(). Determines which file to use,
302 * either one file of a fileset, or the file associated
303 * with a fileobj, allocates and fills an aiolist_t element
304 * for the write, and issues the asynchronous write. This
305 * operation is only valid for random IO, and returns an
306 * error if the flowop is set for sequential IO. Returns
307 * FILEBENCH_OK on success, FILEBENCH_NORSC if iosetup can't
308 * obtain a file to open, and FILEBENCH_ERROR on any
309 * encountered error.
310 */
311 static int
fb_lfsflow_aiowrite(threadflow_t * threadflow,flowop_t * flowop)312 fb_lfsflow_aiowrite(threadflow_t *threadflow, flowop_t *flowop)
313 {
314 caddr_t iobuf;
315 fbint_t wss;
316 fbint_t iosize;
317 fb_fdesc_t *fdesc;
318 int ret;
319
320 iosize = avd_get_int(flowop->fo_iosize);
321
322 if ((ret = flowoplib_iosetup(threadflow, flowop, &wss, &iobuf,
323 &fdesc, iosize)) != FILEBENCH_OK)
324 return (ret);
325
326 if (avd_get_bool(flowop->fo_random)) {
327 uint64_t fileoffset;
328 struct aiocb64 *aiocb;
329 aiolist_t *aiolist;
330
331 if (filebench_randomno64(&fileoffset,
332 wss, iosize, NULL) == -1) {
333 filebench_log(LOG_ERROR,
334 "file size smaller than IO size for thread %s",
335 flowop->fo_name);
336 return (FILEBENCH_ERROR);
337 }
338
339 aiolist = aio_allocate(flowop);
340 aiolist->al_type = AL_WRITE;
341 aiocb = &aiolist->al_aiocb;
342
343 aiocb->aio_fildes = fdesc->fd_num;
344 aiocb->aio_buf = iobuf;
345 aiocb->aio_nbytes = (size_t)iosize;
346 aiocb->aio_offset = (off64_t)fileoffset;
347 aiocb->aio_reqprio = 0;
348
349 filebench_log(LOG_DEBUG_IMPL,
350 "aio fd=%d, bytes=%llu, offset=%llu",
351 fdesc->fd_num, (u_longlong_t)iosize,
352 (u_longlong_t)fileoffset);
353
354 flowop_beginop(threadflow, flowop);
355 if (aio_write64(aiocb) < 0) {
356 filebench_log(LOG_ERROR, "aiowrite failed: %s",
357 strerror(errno));
358 filebench_shutdown(1);
359 }
360 flowop_endop(threadflow, flowop, iosize);
361 } else {
362 return (FILEBENCH_ERROR);
363 }
364
365 return (FILEBENCH_OK);
366 }
367
368
369
370 #define MAXREAP 4096
371
372 /*
373 * Emulate posix aiowait(). Waits for the completion of half the
374 * outstanding asynchronous IOs, or a single IO, which ever is
375 * larger. The routine will return after a sufficient number of
376 * completed calls issued by any thread in the procflow have
377 * completed, or a 1 second timout elapses. All completed
378 * IO operations are deleted from the thread's aiolist.
379 */
380 static int
fb_lfsflow_aiowait(threadflow_t * threadflow,flowop_t * flowop)381 fb_lfsflow_aiowait(threadflow_t *threadflow, flowop_t *flowop)
382 {
383 struct aiocb64 **worklist;
384 aiolist_t *aio = flowop->fo_thread->tf_aiolist;
385 int uncompleted = 0;
386
387 worklist = calloc(MAXREAP, sizeof (struct aiocb64 *));
388
389 /* Count the list of pending aios */
390 while (aio) {
391 uncompleted++;
392 aio = aio->al_next;
393 }
394
395 do {
396 uint_t ncompleted = 0;
397 uint_t todo;
398 struct timespec timeout;
399 int inprogress;
400 int i;
401
402 /* Wait for half of the outstanding requests */
403 timeout.tv_sec = 1;
404 timeout.tv_nsec = 0;
405
406 if (uncompleted > MAXREAP)
407 todo = MAXREAP;
408 else
409 todo = uncompleted / 2;
410
411 if (todo == 0)
412 todo = 1;
413
414 flowop_beginop(threadflow, flowop);
415
416 #if (defined(HAVE_AIOWAITN) && defined(USE_PROCESS_MODEL))
417 if (((aio_waitn64((struct aiocb64 **)worklist,
418 MAXREAP, &todo, &timeout)) == -1) &&
419 errno && (errno != ETIME)) {
420 filebench_log(LOG_ERROR,
421 "aiowait failed: %s, outstanding = %d, "
422 "ncompleted = %d ",
423 strerror(errno), uncompleted, todo);
424 }
425
426 ncompleted = todo;
427 /* Take the completed I/Os from the list */
428 inprogress = 0;
429 for (i = 0; i < ncompleted; i++) {
430 if ((aio_return64(worklist[i]) == -1) &&
431 (errno == EINPROGRESS)) {
432 inprogress++;
433 continue;
434 }
435 if (aio_deallocate(flowop, worklist[i])
436 == FILEBENCH_ERROR) {
437 filebench_log(LOG_ERROR, "Could not remove "
438 "aio from list ");
439 flowop_endop(threadflow, flowop, 0);
440 return (FILEBENCH_ERROR);
441 }
442 }
443
444 uncompleted -= ncompleted;
445 uncompleted += inprogress;
446
447 #else
448
449 for (ncompleted = 0, inprogress = 0,
450 aio = flowop->fo_thread->tf_aiolist;
451 ncompleted < todo, aio != NULL; aio = aio->al_next) {
452 int result = aio_error64(&aio->al_aiocb);
453
454 if (result == EINPROGRESS) {
455 inprogress++;
456 continue;
457 }
458
459 if ((aio_return64(&aio->al_aiocb) == -1) || result) {
460 filebench_log(LOG_ERROR, "aio failed: %s",
461 strerror(result));
462 continue;
463 }
464
465 ncompleted++;
466
467 if (aio_deallocate(flowop, &aio->al_aiocb) < 0) {
468 filebench_log(LOG_ERROR, "Could not remove "
469 "aio from list ");
470 flowop_endop(threadflow, flowop, 0);
471 return (FILEBENCH_ERROR);
472 }
473 }
474
475 uncompleted -= ncompleted;
476
477 #endif
478 filebench_log(LOG_DEBUG_SCRIPT,
479 "aio2 completed %d ios, uncompleted = %d, inprogress = %d",
480 ncompleted, uncompleted, inprogress);
481
482 } while (uncompleted > MAXREAP);
483
484 flowop_endop(threadflow, flowop, 0);
485
486 free(worklist);
487
488 return (FILEBENCH_OK);
489 }
490
491 #endif /* HAVE_AIO */
492
493 /*
494 * Does an open64 of a file. Inserts the file descriptor number returned
495 * by open() into the supplied filebench fd. Returns FILEBENCH_OK on
496 * successs, and FILEBENCH_ERROR on failure.
497 */
498
499 static int
fb_lfs_open(fb_fdesc_t * fd,char * path,int flags,int perms)500 fb_lfs_open(fb_fdesc_t *fd, char *path, int flags, int perms)
501 {
502 if ((fd->fd_num = open64(path, flags, perms)) < 0)
503 return (FILEBENCH_ERROR);
504 else
505 return (FILEBENCH_OK);
506 }
507
/*
 * Removes the directory entry "path" with unlink(). Returns what
 * unlink() returns.
 */
static int
fb_lfs_unlink(char *path)
{
	int rc = unlink(path);

	return (rc);
}
516
/*
 * Reads the contents of the symbolic link "path" into "buf", which
 * holds "buf_size" bytes, using readlink(). Returns what readlink()
 * returns.
 */
static ssize_t
fb_lfs_readlink(const char *path, char *buf, size_t buf_size)
{
	ssize_t len = readlink(path, buf, buf_size);

	return (len);
}
525
526 /*
527 * Does fsync of a file. Returns with fsync return info.
528 */
529 static int
fb_lfs_fsync(fb_fdesc_t * fd)530 fb_lfs_fsync(fb_fdesc_t *fd)
531 {
532 return (fsync(fd->fd_num));
533 }
534
535 /*
536 * Do a posix lseek of a file. Return what lseek() returns.
537 */
538 static int
fb_lfs_lseek(fb_fdesc_t * fd,off64_t offset,int whence)539 fb_lfs_lseek(fb_fdesc_t *fd, off64_t offset, int whence)
540 {
541 return (lseek64(fd->fd_num, offset, whence));
542 }
543
/*
 * Renames "old" to "new" with rename(). Returns what rename()
 * returns.
 */
static int
fb_lfs_rename(const char *old, const char *new)
{
	int rc = rename(old, new);

	return (rc);
}
552
553
554 /*
555 * Do a posix close of a file. Return what close() returns.
556 */
557 static int
fb_lfs_close(fb_fdesc_t * fd)558 fb_lfs_close(fb_fdesc_t *fd)
559 {
560 return (close(fd->fd_num));
561 }
562
/*
 * Creates the directory "path" with permission bits "perm" using
 * mkdir(). Returns what mkdir() returns.
 */
static int
fb_lfs_mkdir(char *path, int perm)
{
	int rc = mkdir(path, perm);

	return (rc);
}
571
/*
 * Deletes the directory "path" with rmdir(). Returns what rmdir()
 * returns.
 */
static int
fb_lfs_rmdir(char *path)
{
	int rc = rmdir(path);

	return (rc);
}
580
581 /*
582 * does a recursive rm to remove an entire directory tree (i.e. a fileset).
583 * Supplied with the path to the root of the tree.
584 */
585 static void
fb_lfs_recur_rm(char * path)586 fb_lfs_recur_rm(char *path)
587 {
588 char cmd[MAXPATHLEN];
589
590 (void) snprintf(cmd, sizeof (cmd), "rm -rf %s", path);
591 (void) system(cmd);
592 }
593
/*
 * Opens the directory "path" with opendir(). Returns the directory
 * handle from opendir(), which is NULL on failure.
 */
static DIR *
fb_lfs_opendir(char *path)
{
	DIR *dirp = opendir(path);

	return (dirp);
}
603
/*
 * Fetches the next entry of the open directory "dirp" with readdir().
 * Returns the pointer readdir() returns, which is NULL when no entry
 * could be obtained.
 */
static struct dirent *
fb_lfs_readdir(DIR *dirp)
{
	struct dirent *entry = readdir(dirp);

	return (entry);
}
613
/*
 * Closes the directory handle "dirp" with closedir(). Returns what
 * closedir() returns.
 */
static int
fb_lfs_closedir(DIR *dirp)
{
	int rc = closedir(dirp);

	return (rc);
}
622
623 /*
624 * Does an fstat of a file.
625 */
626 static int
fb_lfs_fstat(fb_fdesc_t * fd,struct stat64 * statbufp)627 fb_lfs_fstat(fb_fdesc_t *fd, struct stat64 *statbufp)
628 {
629 return (fstat64(fd->fd_num, statbufp));
630 }
631
/*
 * Fills "statbufp" with the status of the file named "path" using
 * stat64(). Returns what stat64() returns.
 */
static int
fb_lfs_stat(char *path, struct stat64 *statbufp)
{
	int rc = stat64(path, statbufp);

	return (rc);
}
640
641 /*
642 * Do a pwrite64 to a file.
643 */
644 static int
fb_lfs_pwrite(fb_fdesc_t * fd,caddr_t iobuf,fbint_t iosize,off64_t offset)645 fb_lfs_pwrite(fb_fdesc_t *fd, caddr_t iobuf, fbint_t iosize, off64_t offset)
646 {
647 return (pwrite64(fd->fd_num, iobuf, iosize, offset));
648 }
649
650 /*
651 * Do a write to a file.
652 */
653 static int
fb_lfs_write(fb_fdesc_t * fd,caddr_t iobuf,fbint_t iosize)654 fb_lfs_write(fb_fdesc_t *fd, caddr_t iobuf, fbint_t iosize)
655 {
656 return (write(fd->fd_num, iobuf, iosize));
657 }
658
659 /*
660 * Does a truncate operation and returns the result
661 */
662 static int
fb_lfs_truncate(fb_fdesc_t * fd,off64_t fse_size)663 fb_lfs_truncate(fb_fdesc_t *fd, off64_t fse_size)
664 {
665 #ifdef HAVE_FTRUNCATE64
666 return (ftruncate64(fd->fd_num, fse_size));
667 #else
668 return (ftruncate(fd->fd_num, (off_t)fse_size));
669 #endif
670 }
671
/*
 * Creates "new" as a hard link to "existing" with link(). Returns
 * what link() returns.
 */
static int
fb_lfs_link(const char *existing, const char *new)
{
	int rc = link(existing, new);

	return (rc);
}
680
/*
 * Creates "new" as a symbolic link whose contents are "existing"
 * using symlink(). Returns what symlink() returns.
 */
static int
fb_lfs_symlink(const char *existing, const char *new)
{
	int rc = symlink(existing, new);

	return (rc);
}
689
/*
 * Checks the accessibility of "path" for the mode "amode" using
 * access(). Returns what access() returns.
 */
static int
fb_lfs_access(const char *path, int amode)
{
	int rc = access(path, amode);

	return (rc);
}
698