1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 #pragma ident "%Z%%M% %I% %E% SMI"
22
23 /*
24 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
25 * Use is subject to license terms.
26 */
27
28 #include <sys/systm.h>
29 #include <sys/types.h>
30 #include <sys/vnode.h>
31 #include <sys/errno.h>
32 #include <sys/sysmacros.h>
33 #include <sys/debug.h>
34 #include <sys/kmem.h>
35 #include <sys/conf.h>
36 #include <sys/proc.h>
37 #include <sys/cmn_err.h>
38 #include <sys/fs/ufs_inode.h>
39 #include <sys/fs/ufs_filio.h>
40 #include <sys/fs/ufs_log.h>
41 #include <sys/inttypes.h>
42 #include <sys/atomic.h>
43 #include <sys/tuneable.h>
44
45 /*
46 * externs
47 */
48 extern pri_t minclsyspri;
49 extern struct kmem_cache *lufs_bp;
50 extern int ufs_trans_push_quota();
51
52 /*
53 * globals
54 */
55 kmem_cache_t *mapentry_cache;
56
57 /*
58 * logmap tuning constants
59 */
60 long logmap_maxnme_commit = 2048;
61 long logmap_maxnme_async = 4096;
62 long logmap_maxnme_sync = 6144;
63 long logmap_maxcfrag_commit = 4; /* Max canceled fragments per moby */
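/*
 * Throughout this file a "moby" refers to the current compound (moby)
 * transaction, i.e. the group of individual transactions that is
 * committed to the log as a single unit.
 */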
64
65
66 uint64_t ufs_crb_size = 0; /* current size of all crb buffers */
67 uint64_t ufs_crb_max_size = 0; /* highest crb buffer use so far */
68 size_t ufs_crb_limit; /* max allowable size for crbs */
69 uint64_t ufs_crb_alloc_fails = 0; /* crb allocation failures stat */
70 #define UFS_MAX_CRB_DEFAULT_DIVISOR 10 /* max 1/10 kmem_maxavail() */
71 int ufs_max_crb_divisor = UFS_MAX_CRB_DEFAULT_DIVISOR; /* tunable */
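/*
 * Note: ufs_crb_limit is not set in this file; it is expected to be
 * computed at logging initialization time, presumably as
 * kmem_maxavail() / ufs_max_crb_divisor per the default divisor above.
 */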
72 void handle_dquot(mapentry_t *);
73
74 /*
75 * GENERIC MAP ROUTINES
76 */
77
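/*
 * Cached roll buffer (crb) management. A crb_t holds a private copy of a
 * master-file block so the roll thread can write the deltas back without
 * re-reading them from the log; it is reference counted and shared by all
 * mapentries whose deltas fall within that block. CRB_FREE releases the
 * buffer and the crb itself and adjusts the ufs_crb_size accounting;
 * CRB_RELE drops one reference and frees the crb when the last one goes.
 */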
78 #define CRB_FREE(crb, me) \
79 kmem_free(crb->c_buf, crb->c_nb); \
80 atomic_add_64(&ufs_crb_size, -(uint64_t)crb->c_nb); \
81 kmem_free(crb, sizeof (crb_t)); \
82 (me)->me_crb = NULL;
83
84 #define CRB_RELE(me) { \
85 crb_t *crb = (me)->me_crb; \
86 if (crb && (--crb->c_refcnt == 0)) { \
87 CRB_FREE(crb, me) \
88 } \
89 }
90
91 /*
92 * Check that the old delta has an argument and a push function of
93 * ufs_trans_push_quota(), then check that the old and new deltas differ.
94 * If so we clean up with handle_dquot() before replacing the old delta.
95 */
96 #define HANDLE_DQUOT(me, melist) { \
97 if ((me->me_arg) && \
98 (me->me_func == ufs_trans_push_quota)) { \
99 if (!((me->me_dt == melist->me_dt) && \
100 (me->me_arg == melist->me_arg) && \
101 (me->me_func == melist->me_func))) { \
102 handle_dquot(me); \
103 } \
104 } \
105 }
106
107 /*
108 * free up all the mapentries for a map
109 */
110 void
111 map_free_entries(mt_map_t *mtm)
112 {
113 int i;
114 mapentry_t *me;
115
116 while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
117 me->me_next->me_prev = me->me_prev;
118 me->me_prev->me_next = me->me_next;
119 CRB_RELE(me);
120 kmem_cache_free(mapentry_cache, me);
121 }
122 for (i = 0; i < mtm->mtm_nhash; i++)
123 mtm->mtm_hash[i] = NULL;
124 mtm->mtm_nme = 0;
125 mtm->mtm_nmet = 0;
126 }
127
128 /*
129 * done with map; free if necessary
130 */
131 mt_map_t *
132 map_put(mt_map_t *mtm)
133 {
134 /*
135 * free up the map's memory
136 */
137 map_free_entries(mtm);
138 ASSERT(map_put_debug(mtm));
139 kmem_free(mtm->mtm_hash,
140 (size_t) (sizeof (mapentry_t *) * mtm->mtm_nhash));
141 mutex_destroy(&mtm->mtm_mutex);
142 mutex_destroy(&mtm->mtm_scan_mutex);
143 cv_destroy(&mtm->mtm_to_roll_cv);
144 cv_destroy(&mtm->mtm_from_roll_cv);
145 rw_destroy(&mtm->mtm_rwlock);
146 mutex_destroy(&mtm->mtm_lock);
147 cv_destroy(&mtm->mtm_cv_commit);
148 cv_destroy(&mtm->mtm_cv_next);
149 cv_destroy(&mtm->mtm_cv_eot);
150 cv_destroy(&mtm->mtm_cv);
151 kmem_free(mtm, sizeof (mt_map_t));
152 return (NULL);
153 }
154 /*
155 * Allocate a map;
156 */
157 mt_map_t *
158 map_get(ml_unit_t *ul, enum maptypes maptype, int nh)
159 {
160 mt_map_t *mtm;
161
162 /*
163 * assume the map is not here and allocate the necessary structs
164 */
165 mtm = kmem_zalloc(sizeof (mt_map_t), KM_SLEEP);
166 mutex_init(&mtm->mtm_mutex, NULL, MUTEX_DEFAULT, NULL);
167 mutex_init(&mtm->mtm_scan_mutex, NULL, MUTEX_DEFAULT, NULL);
168 cv_init(&mtm->mtm_to_roll_cv, NULL, CV_DEFAULT, NULL);
169 cv_init(&mtm->mtm_from_roll_cv, NULL, CV_DEFAULT, NULL);
170 rw_init(&mtm->mtm_rwlock, NULL, RW_DEFAULT, NULL);
171 mtm->mtm_next = (mapentry_t *)mtm;
172 mtm->mtm_prev = (mapentry_t *)mtm;
173 mtm->mtm_hash = kmem_zalloc((size_t) (sizeof (mapentry_t *) * nh),
174 KM_SLEEP);
175 mtm->mtm_nhash = nh;
176 mtm->mtm_debug = ul->un_debug;
177 mtm->mtm_type = maptype;
178
179 mtm->mtm_cfrags = 0;
180 mtm->mtm_cfragmax = logmap_maxcfrag_commit;
181
182 /*
183 * for scan test
184 */
185 mtm->mtm_ul = ul;
186
187 /*
188 * Initialize locks
189 */
190 mutex_init(&mtm->mtm_lock, NULL, MUTEX_DEFAULT, NULL);
191 cv_init(&mtm->mtm_cv_commit, NULL, CV_DEFAULT, NULL);
192 cv_init(&mtm->mtm_cv_next, NULL, CV_DEFAULT, NULL);
193 cv_init(&mtm->mtm_cv_eot, NULL, CV_DEFAULT, NULL);
194 cv_init(&mtm->mtm_cv, NULL, CV_DEFAULT, NULL);
195 ASSERT(map_get_debug(ul, mtm));
196
197 return (mtm);
198 }
199
200 /*
201 * DELTAMAP ROUTINES
202 */
203 /*
204 * deltamap tuning constants
205 */
206 long deltamap_maxnme = 1024; /* global so it can be set */
207
208 int
209 deltamap_need_commit(mt_map_t *mtm)
210 {
211 return (mtm->mtm_nme > deltamap_maxnme);
212 }
213
214 /*
215 * put a delta into a deltamap; may sleep on memory
216 */
217 void
218 deltamap_add(
219 mt_map_t *mtm,
220 offset_t mof,
221 off_t nb,
222 delta_t dtyp,
223 int (*func)(),
224 ulong_t arg,
225 threadtrans_t *tp)
226 {
227 int32_t hnb;
228 mapentry_t *me;
229 mapentry_t **mep;
230
231 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
232 map_check_linkage(mtm));
233
234 mutex_enter(&mtm->mtm_mutex);
235
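/*
 * Walk the delta in MAPBLOCKSIZE-aligned chunks: hnb is the number of
 * bytes of this delta that fall within the current map block.
 */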
236 for (hnb = 0; nb; nb -= hnb, mof += hnb) {
237 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
238 if (hnb > nb)
239 hnb = nb;
240 /*
241 * Search for dup entry. We need to ensure that we don't
242 * replace a map entry which carries quota information
243 * with a map entry which doesn't. In that case we lose
244 * the reference to the dquot structure, which will not be
245 * cleaned up by the push function me->me_func as this will
246 * never be called.
247 * The stray dquot would be found later by invalidatedq()
248 * causing a panic when the filesystem is unmounted.
249 */
250 mep = MAP_HASH(mof, mtm);
251 for (me = *mep; me; me = me->me_hash) {
252 if (DATAwithinME(mof, hnb, me)) {
253 /*
254 * Don't remove quota entries which have
255 * incremented the ref count (those with a
256 * ufs_trans_push_quota push function).
257 * Let logmap_add[_buf] clean them up.
258 */
259 if (me->me_func == ufs_trans_push_quota) {
260 continue;
261 }
262 break;
263 }
264 ASSERT((dtyp == DT_CANCEL) ||
265 (!DATAoverlapME(mof, hnb, me)) ||
266 MEwithinDATA(me, mof, hnb));
267 }
268
269 if (me) {
270 /* already in map */
271 continue;
272 }
273
274 /*
275 * Add up all the delta map deltas so we can compute
276 * an upper bound on the log size used.
277 * Note, some deltas get removed from the deltamap
278 * before the deltamap_push by lufs_write_strategy
279 * and so multiple deltas to the same mof offset
280 * don't get cancelled here but in the logmap.
281 * Thus we can't easily get an accurate count of
282 * the log space used - only an upper bound.
283 */
284 if (tp && (mtm->mtm_ul->un_deltamap == mtm)) {
285 ASSERT(dtyp != DT_CANCEL);
286 if (dtyp == DT_ABZERO) {
287 tp->deltas_size += sizeof (struct delta);
288 } else {
289 tp->deltas_size +=
290 (hnb + sizeof (struct delta));
291 }
292 }
293
294 delta_stats[dtyp]++;
295
296 /*
297 * get a mapentry
298 * May need to drop & re-grab the mtm_mutex
299 * and then recheck for a duplicate
300 */
301 me = kmem_cache_alloc(mapentry_cache, KM_NOSLEEP);
302 if (me == NULL) {
303 mutex_exit(&mtm->mtm_mutex);
304 me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
305 mutex_enter(&mtm->mtm_mutex);
306 }
307 bzero(me, sizeof (mapentry_t));
308
309 /*
310 * initialize and put in deltamap
311 */
312 me->me_mof = mof;
313 me->me_nb = hnb;
314 me->me_func = func;
315 me->me_arg = arg;
316 me->me_dt = dtyp;
317 me->me_flags = ME_HASH;
318 me->me_tid = mtm->mtm_tid;
319
320 me->me_hash = *mep;
321 *mep = me;
322 me->me_next = (mapentry_t *)mtm;
323 me->me_prev = mtm->mtm_prev;
324 mtm->mtm_prev->me_next = me;
325 mtm->mtm_prev = me;
326 mtm->mtm_nme++;
327 }
328 mutex_exit(&mtm->mtm_mutex);
329
330 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
331 map_check_linkage(mtm));
332 }
333
334 /*
335 * remove deltas within (mof, nb) and return as linked list
336 */
337 mapentry_t *
338 deltamap_remove(mt_map_t *mtm, offset_t mof, off_t nb)
339 {
340 off_t hnb;
341 mapentry_t *me;
342 mapentry_t **mep;
343 mapentry_t *mer;
344
345 if (mtm == NULL)
346 return (NULL);
347
348 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
349 map_check_linkage(mtm));
350
351 mutex_enter(&mtm->mtm_mutex);
352 for (mer = NULL, hnb = 0; nb; nb -= hnb, mof += hnb) {
353 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
354 if (hnb > nb)
355 hnb = nb;
356 /*
357 * remove entries from hash and return as an aged linked list
358 */
359 mep = MAP_HASH(mof, mtm);
360 while ((me = *mep) != 0) {
361 if (MEwithinDATA(me, mof, hnb)) {
362 *mep = me->me_hash;
363 me->me_next->me_prev = me->me_prev;
364 me->me_prev->me_next = me->me_next;
365 me->me_hash = mer;
366 mer = me;
367 me->me_flags |= ME_LIST;
368 me->me_flags &= ~ME_HASH;
369 mtm->mtm_nme--;
370 } else
371 mep = &me->me_hash;
372 }
373 }
374 mutex_exit(&mtm->mtm_mutex);
375
376 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
377 map_check_linkage(mtm));
378
379 return (mer);
380 }
381
382 /*
383 * delete entries within (mof, nb)
384 */
385 void
386 deltamap_del(mt_map_t *mtm, offset_t mof, off_t nb)
387 {
388 mapentry_t *me;
389 mapentry_t *menext;
390
391 menext = deltamap_remove(mtm, mof, nb);
392 while ((me = menext) != 0) {
393 menext = me->me_hash;
394 kmem_cache_free(mapentry_cache, me);
395 }
396 }
397
398 /*
399 * Call the indicated function to cause deltas to move to the logmap.
400 * top_end_sync() is the only caller of this function and
401 * it has waited for the completion of all threads, so there can
402 * be no other activity in the deltamap. Therefore we don't need to
403 * hold the deltamap lock.
404 */
405 void
406 deltamap_push(ml_unit_t *ul)
407 {
408 delta_t dtyp;
409 int (*func)();
410 ulong_t arg;
411 mapentry_t *me;
412 offset_t mof;
413 off_t nb;
414 mt_map_t *mtm = ul->un_deltamap;
415
416 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
417 map_check_linkage(mtm));
418
419 /*
420 * for every entry in the deltamap
421 */
422 while ((me = mtm->mtm_next) != (mapentry_t *)mtm) {
423 ASSERT(me->me_func);
424 func = me->me_func;
425 dtyp = me->me_dt;
426 arg = me->me_arg;
427 mof = me->me_mof;
428 nb = me->me_nb;
429 if ((ul->un_flags & LDL_ERROR) ||
430 (*func)(ul->un_ufsvfs, dtyp, arg))
431 deltamap_del(mtm, mof, nb);
432 }
433
434 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
435 map_check_linkage(mtm));
436 }
437
438 /*
439 * LOGMAP ROUTINES
440 */
441
442 int
443 logmap_need_commit(mt_map_t *mtm)
444 {
445 return ((mtm->mtm_nmet > logmap_maxnme_commit) ||
446 (mtm->mtm_cfrags >= mtm->mtm_cfragmax));
447 }
448
449 int
450 logmap_need_roll_async(mt_map_t *mtm)
451 {
452 return (mtm->mtm_nme > logmap_maxnme_async);
453 }
454
455 int
456 logmap_need_roll_sync(mt_map_t *mtm)
457 {
458 return (mtm->mtm_nme > logmap_maxnme_sync);
459 }
460
461 void
462 logmap_start_roll(ml_unit_t *ul)
463 {
464 mt_map_t *logmap = ul->un_logmap;
465
466 logmap_settail(logmap, ul);
467 ASSERT(!(ul->un_flags & LDL_NOROLL));
468 mutex_enter(&logmap->mtm_mutex);
469 if ((logmap->mtm_flags & MTM_ROLL_RUNNING) == 0) {
470 logmap->mtm_flags |= MTM_ROLL_RUNNING;
471 logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_EXIT);
472 (void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
473 TS_RUN, minclsyspri);
474 }
475 mutex_exit(&logmap->mtm_mutex);
476 }
477
478 void
479 logmap_kill_roll(ml_unit_t *ul)
480 {
481 mt_map_t *mtm = ul->un_logmap;
482
483 if (mtm == NULL)
484 return;
485
486 mutex_enter(&mtm->mtm_mutex);
487
488 while (mtm->mtm_flags & MTM_ROLL_RUNNING) {
489 mtm->mtm_flags |= MTM_ROLL_EXIT;
490 cv_signal(&mtm->mtm_to_roll_cv);
491 cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
492 }
493 mutex_exit(&mtm->mtm_mutex);
494 }
495
496 /*
497 * kick the roll thread if it's not doing anything
498 */
499 void
500 logmap_forceroll_nowait(mt_map_t *logmap)
501 {
502 /*
503 * Don't need to lock mtm_mutex to read mtm_flags here as we
504 * don't care in the rare case when we get a transitional value
505 * of mtm_flags. Simply by signalling the thread, it will wake up
506 * and notice it has too many logmap entries.
507 */
508 ASSERT(!(logmap->mtm_ul->un_flags & LDL_NOROLL));
509 if ((logmap->mtm_flags & MTM_ROLLING) == 0) {
510 cv_signal(&logmap->mtm_to_roll_cv);
511 }
512 }
513
514 /*
515 * kick the roll thread and wait for it to finish a cycle
516 */
517 void
518 logmap_forceroll(mt_map_t *mtm)
519 {
520 mutex_enter(&mtm->mtm_mutex);
521 if ((mtm->mtm_flags & MTM_FORCE_ROLL) == 0) {
522 mtm->mtm_flags |= MTM_FORCE_ROLL;
523 cv_signal(&mtm->mtm_to_roll_cv);
524 }
525 do {
526 if ((mtm->mtm_flags & MTM_ROLL_RUNNING) == 0) {
527 mtm->mtm_flags &= ~MTM_FORCE_ROLL;
528 goto out;
529 }
530 cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
531 } while (mtm->mtm_flags & MTM_FORCE_ROLL);
532 out:
533 mutex_exit(&mtm->mtm_mutex);
534 }
535
536 /*
537 * remove rolled deltas within (mof, nb) and free them
538 */
539 void
540 logmap_remove_roll(mt_map_t *mtm, offset_t mof, off_t nb)
541 {
542 int dolock = 0;
543 off_t hnb;
544 mapentry_t *me;
545 mapentry_t **mep;
546 offset_t savmof = mof;
547 off_t savnb = nb;
548
549 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
550 map_check_linkage(mtm));
551
552 again:
553 if (dolock)
554 rw_enter(&mtm->mtm_rwlock, RW_WRITER);
555 mutex_enter(&mtm->mtm_mutex);
556 for (hnb = 0; nb; nb -= hnb, mof += hnb) {
557 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
558 if (hnb > nb)
559 hnb = nb;
560 /*
561 * remove and free the rolled entries
562 */
563 mep = MAP_HASH(mof, mtm);
564 while ((me = *mep) != 0) {
565 if ((me->me_flags & ME_ROLL) &&
566 (MEwithinDATA(me, mof, hnb))) {
567 if (me->me_flags & ME_AGE) {
568 ASSERT(dolock == 0);
569 dolock = 1;
570 mutex_exit(&mtm->mtm_mutex);
571 mof = savmof;
572 nb = savnb;
573 goto again;
574 }
575 *mep = me->me_hash;
576 me->me_next->me_prev = me->me_prev;
577 me->me_prev->me_next = me->me_next;
578 me->me_flags &= ~(ME_HASH|ME_ROLL);
579 ASSERT(!(me->me_flags & ME_USER));
580 mtm->mtm_nme--;
581 /*
582 * cancelled entries are handled by someone else
583 */
584 if ((me->me_flags & ME_CANCEL) == 0) {
585 roll_stats[me->me_dt]++;
586 CRB_RELE(me);
587 kmem_cache_free(mapentry_cache, me);
588 }
589 } else
590 mep = &me->me_hash;
591 }
592 }
593 mutex_exit(&mtm->mtm_mutex);
594
595 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
596 map_check_linkage(mtm));
597
598 if (dolock)
599 rw_exit(&mtm->mtm_rwlock);
600 }
601
602 /*
603 * Find the disk offset of the next delta to roll.
604 * Returns 0: no more deltas to roll or a transaction is being committed
605 * 1: a delta to roll has been found and *mofp points
606 * to the master file disk offset
607 */
608 int
609 logmap_next_roll(mt_map_t *logmap, offset_t *mofp)
610 {
611 mapentry_t *me;
612
613 ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
614 map_check_linkage(logmap));
615
616 mutex_enter(&logmap->mtm_mutex);
617 for (me = logmap->mtm_next; me != (mapentry_t *)logmap;
618 me = me->me_next) {
619 /* already rolled */
620 if (me->me_flags & ME_ROLL) {
621 continue;
622 }
623
624 /* part of currently busy transaction; stop */
625 if (me->me_tid == logmap->mtm_tid) {
626 break;
627 }
628
629 /* part of commit-in-progress transaction; stop */
630 if (me->me_tid == logmap->mtm_committid) {
631 break;
632 }
633
634 /*
635 * We shouldn't see a DT_CANCEL mapentry whose
636 * tid != mtm_committid, or != mtm_tid since
637 * these are removed at the end of each committed
638 * transaction.
639 */
640 ASSERT(!(me->me_dt == DT_CANCEL));
641
642 *mofp = me->me_mof;
643 mutex_exit(&logmap->mtm_mutex);
644 return (1);
645 }
646 mutex_exit(&logmap->mtm_mutex);
647 return (0);
648 }
649
650 /*
651 * put mapentry on sorted age list
652 */
653 static void
654 logmap_list_age(mapentry_t **age, mapentry_t *meadd)
655 {
656 mapentry_t *me;
657
658 ASSERT(!(meadd->me_flags & (ME_AGE|ME_LIST)));
659
660 for (me = *age; me; age = &me->me_agenext, me = *age) {
661 if (me->me_age > meadd->me_age)
662 break;
663 }
664 meadd->me_agenext = me;
665 meadd->me_flags |= ME_AGE;
666 *age = meadd;
667 }
668
669 /*
670 * get a list of deltas within <mof, mof+nb>
671 * returns with mtm_rwlock held
672 * return value says whether the entire mof range is covered by deltas
673 */
674 int
675 logmap_list_get(
676 mt_map_t *mtm,
677 offset_t mof,
678 off_t nb,
679 mapentry_t **age)
680 {
681 off_t hnb;
682 mapentry_t *me;
683 mapentry_t **mep;
684 int rwtype = RW_READER;
685 offset_t savmof = mof;
686 off_t savnb = nb;
687 int entire = 0;
688 crb_t *crb;
689
690 mtm->mtm_ref = 1;
691 again:
692
693 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
694 map_check_linkage(mtm));
695
696 rw_enter(&mtm->mtm_rwlock, rwtype);
697 *age = NULL;
698 mutex_enter(&mtm->mtm_mutex);
699 for (hnb = 0; nb; nb -= hnb, mof += hnb) {
700 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
701 if (hnb > nb)
702 hnb = nb;
703 /*
704 * find overlapping entries
705 */
706 mep = MAP_HASH(mof, mtm);
707 for (me = *mep; me; me = me->me_hash) {
708 if (me->me_dt == DT_CANCEL)
709 continue;
710 if (!DATAoverlapME(mof, hnb, me))
711 continue;
712 /*
713 * check if map entry is in use
714 * (about to be rolled).
715 */
716 if (me->me_flags & ME_AGE) {
717 /*
718 * reset the age bit in the list,
719 * upgrade the lock, and try again
720 */
721 for (me = *age; me; me = *age) {
722 *age = me->me_agenext;
723 me->me_flags &= ~ME_AGE;
724 }
725 mutex_exit(&mtm->mtm_mutex);
726 rw_exit(&mtm->mtm_rwlock);
727 rwtype = RW_WRITER;
728 mof = savmof;
729 nb = savnb;
730 entire = 0;
731 goto again;
732 } else {
733 /* add mapentry to age ordered list */
734 logmap_list_age(age, me);
735 crb = me->me_crb;
736 if (crb) {
737 if (DATAwithinCRB(savmof, savnb, crb)) {
738 entire = 1;
739 }
740 } else {
741 if (DATAwithinME(savmof, savnb, me)) {
742 entire = 1;
743 }
744 }
745 }
746 }
747 }
748 mutex_exit(&mtm->mtm_mutex);
749
750 ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
751 return (entire);
752 }
753
754 /*
755 * Get a list of deltas for rolling - returns success or failure.
756 * Also return the cached roll buffer if all deltas point to it.
757 */
758 int
759 logmap_list_get_roll(mt_map_t *logmap, offset_t mof, rollbuf_t *rbp)
760 {
761 mapentry_t *me, **mep, *age = NULL;
762 crb_t *crb = NULL;
763
764 ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
765 ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
766 map_check_linkage(logmap));
767 ASSERT((mof & MAPBLOCKOFF) == 0);
768
769 rbp->rb_crb = NULL;
770
771 /*
772 * find overlapping entries
773 */
774 mutex_enter(&logmap->mtm_mutex);
775 mep = MAP_HASH(mof, logmap);
776 for (me = *mep; me; me = me->me_hash) {
777 if (!DATAoverlapME(mof, MAPBLOCKSIZE, me))
778 continue;
779 if (me->me_tid == logmap->mtm_tid)
780 continue;
781 if (me->me_tid == logmap->mtm_committid)
782 continue;
783 if (me->me_dt == DT_CANCEL)
784 continue;
785
786 /*
787 * Check if map entry is in use (by lufs_read_strategy())
788 * and if so reset the age bit in the list,
789 * upgrade the lock, and try again
790 */
791 if (me->me_flags & ME_AGE) {
792 for (me = age; me; me = age) {
793 age = me->me_agenext;
794 me->me_flags &= ~ME_AGE;
795 }
796 mutex_exit(&logmap->mtm_mutex);
797 return (1); /* failure */
798 } else {
799 /* add mapentry to age ordered list */
800 logmap_list_age(&age, me);
801 }
802 }
803 if (!age) {
804 goto out;
805 }
806
807 /*
808 * Mark the deltas as being rolled.
809 */
810 for (me = age; me; me = me->me_agenext) {
811 me->me_flags |= ME_ROLL;
812 }
813
814 /*
815 * Test if all deltas are covered by one valid roll buffer
816 */
817 crb = age->me_crb;
818 if (crb && !(crb->c_invalid)) {
819 for (me = age; me; me = me->me_agenext) {
820 if (me->me_crb != crb) {
821 crb = NULL;
822 break;
823 }
824 }
825 rbp->rb_crb = crb;
826 }
827 out:
828 rbp->rb_age = age;
829
830 mutex_exit(&logmap->mtm_mutex);
831
832 ASSERT(((logmap->mtm_debug & MT_SCAN) == 0) ||
833 logmap_logscan_debug(logmap, age));
834 ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
835 return (0); /* success */
836 }
837
838 void
839 logmap_list_put_roll(mt_map_t *mtm, mapentry_t *age)
840 {
841 mapentry_t *me;
842
843 ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
844 mutex_enter(&mtm->mtm_mutex);
845 for (me = age; me; me = age) {
846 age = me->me_agenext;
847 me->me_flags &= ~ME_AGE;
848 }
849 mutex_exit(&mtm->mtm_mutex);
850 }
851
852 void
853 logmap_list_put(mt_map_t *mtm, mapentry_t *age)
854 {
855 mapentry_t *me;
856
857 ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
858 mutex_enter(&mtm->mtm_mutex);
859 for (me = age; me; me = age) {
860 age = me->me_agenext;
861 me->me_flags &= ~ME_AGE;
862 }
863 mutex_exit(&mtm->mtm_mutex);
864 rw_exit(&mtm->mtm_rwlock);
865 }
866
867 #define UFS_RW_BALANCE 2
868 int ufs_rw_balance = UFS_RW_BALANCE;
869
870 /*
871 * Check if we need to read the master.
872 * The master does not need to be read if the log deltas to the
873 * block are for one contiguous set of full disk sectors.
874 * Cylinder group bit maps (DT_CG, 8K), directory entries (512B),
875 * and possibly others should not require master disk reads.
876 * Calculate the sector map for writing later.
877 */
878 int
879 logmap_setup_read(mapentry_t *age, rollbuf_t *rbp)
880 {
881 offset_t mof;
882 crb_t *crb;
883 mapentry_t *me;
884 int32_t nb;
885 int i;
886 int start_sec, end_sec;
887 int read_needed = 0;
888 int all_inodes = 1;
889 int first_sec = INT_MAX;
890 int last_sec = -1;
891 rbsecmap_t secmap = 0;
892
893 /* LINTED: warning: logical expression always true: op "||" */
894 ASSERT((MAPBLOCKSIZE / DEV_BSIZE) == (sizeof (secmap) * NBBY));
895
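/*
 * secmap has one bit per DEV_BSIZE sector of the MAPBLOCKSIZE block.
 * For example, assuming 512-byte sectors, a 1K delta starting 2K into
 * the block sets bits 4 and 5.
 */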
896 for (me = age; me; me = me->me_agenext) {
897 crb = me->me_crb;
898 if (crb) {
899 nb = crb->c_nb;
900 mof = crb->c_mof;
901 } else {
902 nb = me->me_nb;
903 mof = me->me_mof;
904 }
905
906 /*
907 * If the delta is not sector aligned then
908 * read the whole block.
909 */
910 if ((nb & DEV_BMASK) || (mof & DEV_BMASK)) {
911 read_needed = 1;
912 }
913
914 /* Set sector map used in the MAPBLOCKSIZE block. */
915 start_sec = (mof & MAPBLOCKOFF) >> DEV_BSHIFT;
916 end_sec = start_sec + ((nb - 1) >> DEV_BSHIFT);
917 for (i = start_sec; i <= end_sec; i++) {
918 secmap |= UINT16_C(1) << i;
919 }
920
921 if (me->me_dt != DT_INODE) {
922 all_inodes = 0;
923 }
924 if (start_sec < first_sec) {
925 first_sec = start_sec;
926 }
927 if (end_sec > last_sec) {
928 last_sec = end_sec;
929 }
930 }
931
932 ASSERT(secmap);
933 ASSERT(first_sec != INT_MAX);
934 ASSERT(last_sec != -1);
935
936 if (all_inodes) {
937 /*
938 * Here we have a tradeoff choice. It must be better to
939 * do 2 writes in the same MAPBLOCKSIZE chunk than a
940 * read and a write. But what about 3 or more writes, versus
941 * a read+write? Where is the cut over? It will depend on
942 * the track caching, scsi driver and other activity.
943 * An unpublished tunable is defined (ufs_rw_balance) that
944 * currently defaults to 2.
945 */
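/*
 * For example, with ufs_rw_balance at its default of 2, up to two gaps
 * between dirty sectors are tolerated (at most three separate writes);
 * a third gap makes reading the whole block and rewriting it the
 * preferred option.
 */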
946 if (!read_needed) {
947 int count = 0, gap = 0;
948 int sector_set; /* write needed to this sector */
949
950 /* Count the gaps (every 1 to 0 transition) */
951 for (i = first_sec + 1; i < last_sec; i++) {
952 sector_set = secmap & (UINT16_C(1) << i);
953 if (!gap && !sector_set) {
954 gap = 1;
955 count++;
956 if (count > ufs_rw_balance) {
957 read_needed = 1;
958 break;
959 }
960 } else if (gap && sector_set) {
961 gap = 0;
962 }
963 }
964 }
965
966 /*
967 * Inodes commonly make up the majority (~85%) of deltas.
968 * They cannot contain embedded user data, so it's safe to
969 * read and write them all in one IO.
970 * But for directory entries, shadow inode data, and
971 * quota record data the user data fragments can be embedded
972 * between that metadata, and so it's not safe to read, modify
973 * then write the entire range, as asynchronous user data
974 * writes could get overwritten with old data.
975 * Thus we have to create a segment map of the metadata that
976 * needs to get written.
977 *
978 * If user data was logged then this issue would go away.
979 */
980 if (read_needed) {
981 for (i = first_sec + 1; i < last_sec; i++) {
982 secmap |= (UINT16_C(1) << i);
983 }
984 }
985 }
986 rbp->rb_secmap = secmap;
987 return (read_needed);
988 }
989
990 /*
991 * Abort the load of a set of log map deltas.
992 * i.e.,
993 * Clear out all mapentries on this unit's log map
994 * which have a tid (transaction id) equal to the
995 * parameter tid. Walk the cancel list, taking everything
996 * off it, too.
997 */
998 static void
999 logmap_abort(ml_unit_t *ul, uint32_t tid)
1000 {
1001 struct mt_map *mtm = ul->un_logmap; /* Log map */
1002 mapentry_t *me, **mep;
1003 int i;
1004
1005 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1006 map_check_linkage(mtm));
1007
1008 /*
1009 * wait for any outstanding reads to finish; lock out future reads
1010 */
1011 rw_enter(&mtm->mtm_rwlock, RW_WRITER);
1012
1013 mutex_enter(&mtm->mtm_mutex);
1014 /* Take everything off cancel list */
1015 while ((me = mtm->mtm_cancel) != NULL) {
1016 mtm->mtm_cancel = me->me_cancel;
1017 me->me_flags &= ~ME_CANCEL;
1018 me->me_cancel = NULL;
1019 }
1020
1021 /*
1022 * Now take out all mapentries with the current tid, and the committid,
1023 * as this function is called from logmap_logscan and logmap_commit.
1024 * When it is called from logmap_logscan, mtm_tid == mtm_committid.
1025 * But when logmap_abort is called from logmap_commit it is
1026 * because the log errored when trying to write the commit record,
1027 * after the async ops have been allowed to start in top_end_sync.
1028 * So we also need to remove all mapentries from the transaction whose
1029 * commit failed.
1030 */
1031 for (i = 0; i < mtm->mtm_nhash; i++) {
1032 mep = &mtm->mtm_hash[i];
1033 while ((me = *mep) != NULL) {
1034 if (me->me_tid == tid ||
1035 me->me_tid == mtm->mtm_committid) {
1036 *mep = me->me_hash;
1037 me->me_next->me_prev = me->me_prev;
1038 me->me_prev->me_next = me->me_next;
1039 if (!(me->me_flags & ME_USER)) {
1040 mtm->mtm_nme--;
1041 }
1042 CRB_RELE(me);
1043 kmem_cache_free(mapentry_cache, me);
1044 continue;
1045 }
1046 mep = &me->me_hash;
1047 }
1048 }
1049
1050 if (!(ul->un_flags & LDL_SCAN))
1051 mtm->mtm_flags |= MTM_CANCELED;
1052 mutex_exit(&mtm->mtm_mutex);
1053 mtm->mtm_dirty = 0;
1054 mtm->mtm_nmet = 0;
1055 rw_exit(&mtm->mtm_rwlock);
1056
1057 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1058 map_check_linkage(mtm));
1059 }
1060
1061 static void
1062 logmap_wait_space(mt_map_t *mtm, ml_unit_t *ul, mapentry_t *me)
1063 {
1064 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1065
1066 while (!ldl_has_space(ul, me)) {
1067 ASSERT(!(ul->un_flags & LDL_NOROLL));
1068 mutex_exit(&ul->un_log_mutex);
1069 logmap_forceroll(mtm);
1070 mutex_enter(&ul->un_log_mutex);
1071 if (ul->un_flags & LDL_ERROR)
1072 break;
1073 }
1074
1075 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1076 }
1077
1078 /*
1079 * put a list of deltas into a logmap
1080 * If va == NULL, don't write to the log.
1081 */
1082 void
1083 logmap_add(
1084 ml_unit_t *ul,
1085 char *va, /* Ptr to buf w/deltas & data */
1086 offset_t vamof, /* Offset on master of buf start */
1087 mapentry_t *melist) /* Entries to add */
1088 {
1089 offset_t mof;
1090 off_t nb;
1091 mapentry_t *me;
1092 mapentry_t **mep;
1093 mapentry_t **savmep;
1094 uint32_t tid;
1095 mt_map_t *mtm = ul->un_logmap;
1096
1097 mutex_enter(&ul->un_log_mutex);
1098 if (va)
1099 logmap_wait_space(mtm, ul, melist);
1100
1101 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1102 map_check_linkage(mtm));
1103
1104 mtm->mtm_ref = 1;
1105 mtm->mtm_dirty++;
1106 tid = mtm->mtm_tid;
1107 while (melist) {
1108 mof = melist->me_mof;
1109 nb = melist->me_nb;
1110
1111 /*
1112 * search for overlapping entries
1113 */
1114 savmep = mep = MAP_HASH(mof, mtm);
1115 mutex_enter(&mtm->mtm_mutex);
1116 while ((me = *mep) != 0) {
1117 /*
1118 * Data consumes old map entry; cancel map entry.
1119 * Take care when we replace an old map entry
1120 * which carries quota information with a newer entry
1121 * which does not. In that case the push function
1122 * would not be called to clean up the dquot structure.
1123 * This would be found later by invalidatedq() causing
1124 * a panic when the filesystem is unmounted.
1125 * We clean up the dquot manually and then replace
1126 * the map entry.
1127 */
1128 if (MEwithinDATA(me, mof, nb) &&
1129 ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
1130 if (tid == me->me_tid &&
1131 ((me->me_flags & ME_AGE) == 0)) {
1132 *mep = me->me_hash;
1133 me->me_next->me_prev = me->me_prev;
1134 me->me_prev->me_next = me->me_next;
1135 ASSERT(!(me->me_flags & ME_USER));
1136 mtm->mtm_nme--;
1137 /*
1138 * Special case if the mapentry
1139 * carries a dquot and a push function.
1140 * We have to clean up the quota info
1141 * before replacing the mapentry.
1142 */
1143 if (me->me_dt == DT_QR)
1144 HANDLE_DQUOT(me, melist);
1145
1146 kmem_cache_free(mapentry_cache, me);
1147 continue;
1148 }
1149 me->me_cancel = mtm->mtm_cancel;
1150 mtm->mtm_cancel = me;
1151 me->me_flags |= ME_CANCEL;
1152 }
1153 mep = &(*mep)->me_hash;
1154 }
1155 mutex_exit(&mtm->mtm_mutex);
1156
1157 /*
1158 * remove from list
1159 */
1160 me = melist;
1161 melist = melist->me_hash;
1162 me->me_flags &= ~ME_LIST;
1163 /*
1164 * If va != NULL, put in the log.
1165 */
1166 if (va)
1167 ldl_write(ul, va, vamof, me);
1168 if (ul->un_flags & LDL_ERROR) {
1169 kmem_cache_free(mapentry_cache, me);
1170 continue;
1171 }
1172 ASSERT((va == NULL) ||
1173 ((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
1174 map_check_ldl_write(ul, va, vamof, me));
1175
1176 /*
1177 * put on hash
1178 */
1179 mutex_enter(&mtm->mtm_mutex);
1180 me->me_hash = *savmep;
1181 *savmep = me;
1182 me->me_next = (mapentry_t *)mtm;
1183 me->me_prev = mtm->mtm_prev;
1184 mtm->mtm_prev->me_next = me;
1185 mtm->mtm_prev = me;
1186 me->me_flags |= ME_HASH;
1187 me->me_tid = tid;
1188 me->me_age = mtm->mtm_age++;
1189 mtm->mtm_nme++;
1190 mtm->mtm_nmet++;
1191 mutex_exit(&mtm->mtm_mutex);
1192 }
1193
1194 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1195 map_check_linkage(mtm));
1196 mutex_exit(&ul->un_log_mutex);
1197 }
1198
1199 /*
1200 * Add the delta(s) into the log.
1201 * Create one cached roll buffer logmap entry, and reference count the
1202 * number of mapentries referring to it.
1203 * Cancel previous logmap entries.
1204 * logmap_add is tolerant of failure to allocate a cached roll buffer.
1205 */
1206 void
1207 logmap_add_buf(
1208 ml_unit_t *ul,
1209 char *va, /* Ptr to buf w/deltas & data */
1210 offset_t bufmof, /* Offset on master of buf start */
1211 mapentry_t *melist, /* Entries to add */
1212 caddr_t buf, /* Buffer containing delta(s) */
1213 uint32_t bufsz) /* Size of buf */
1214 {
1215 offset_t mof;
1216 offset_t vamof = bufmof + (va - buf);
1217 off_t nb;
1218 mapentry_t *me;
1219 mapentry_t **mep;
1220 mapentry_t **savmep;
1221 uint32_t tid;
1222 mt_map_t *mtm = ul->un_logmap;
1223 crb_t *crb;
1224 crb_t *crbsav = NULL;
1225
1226 ASSERT((bufsz & DEV_BMASK) == 0);
1227 mutex_enter(&ul->un_log_mutex);
1228 logmap_wait_space(mtm, ul, melist);
1229
1230 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1231 map_check_linkage(mtm));
1232
1233 mtm->mtm_ref = 1;
1234 mtm->mtm_dirty++;
1235 tid = mtm->mtm_tid;
1236 while (melist) {
1237 mof = melist->me_mof;
1238 nb = melist->me_nb;
1239
1240 /*
1241 * search for overlapping entries
1242 */
1243 savmep = mep = MAP_HASH(mof, mtm);
1244 mutex_enter(&mtm->mtm_mutex);
1245 while ((me = *mep) != 0) {
1246 /*
1247 * Data consumes old map entry; cancel map entry.
1248 * Take care when we replace an old map entry
1249 * which carries quota information with a newer entry
1250 * which does not. In that case the push function
1251 * would not be called to clean up the dquot structure.
1252 * This would be found later by invalidatedq() causing
1253 * a panic when the filesystem is unmounted.
1254 * We clean up the dquot manually and then replace
1255 * the map entry.
1256 */
1257 crb = me->me_crb;
1258 if (MEwithinDATA(me, mof, nb) &&
1259 ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
1260 if (tid == me->me_tid &&
1261 ((me->me_flags & ME_AGE) == 0)) {
1262 *mep = me->me_hash;
1263 me->me_next->me_prev = me->me_prev;
1264 me->me_prev->me_next = me->me_next;
1265 ASSERT(!(me->me_flags & ME_USER));
1266 mtm->mtm_nme--;
1267 /*
1268 * Special case if the mapentry
1269 * carries a dquot and a push function.
1270 * We have to clean up the quota info
1271 * before replacing the mapentry.
1272 */
1273 if (me->me_dt == DT_QR)
1274 HANDLE_DQUOT(me, melist);
1275
1276 /*
1277 * If this soon to be deleted mapentry
1278 * has a suitable roll buffer then
1279 * re-use it.
1280 */
1281 if (crb && (--crb->c_refcnt == 0)) {
1282 if (crbsav ||
1283 (crb->c_nb != bufsz)) {
1284 CRB_FREE(crb, me);
1285 } else {
1286 bcopy(buf, crb->c_buf,
1287 bufsz);
1288 crb->c_invalid = 0;
1289 crb->c_mof = bufmof;
1290 crbsav = crb;
1291 me->me_crb = NULL;
1292 }
1293 }
1294 kmem_cache_free(mapentry_cache, me);
1295 continue;
1296 }
1297 me->me_cancel = mtm->mtm_cancel;
1298 mtm->mtm_cancel = me;
1299 me->me_flags |= ME_CANCEL;
1300 }
1301
1302 /*
1303 * Inode deltas within the same fs block come
1304 * in individually as separate calls to logmap_add().
1305 * All others come in as one call. So check for an
1306 * existing entry where we can re-use the crb.
1307 */
1308 if ((me->me_dt == DT_INODE) && (tid == me->me_tid) &&
1309 !crbsav && crb &&
1310 WITHIN(mof, nb, crb->c_mof, crb->c_nb)) {
1311 ASSERT(crb->c_mof == bufmof);
1312 ASSERT(crb->c_nb == bufsz);
1313 bcopy(buf, crb->c_buf, bufsz);
1314 crbsav = crb;
1315 }
1316 mep = &(*mep)->me_hash;
1317 }
1318 mutex_exit(&mtm->mtm_mutex);
1319
1320 /*
1321 * If we don't already have a crb then allocate one
1322 * and copy the incoming buffer. Only do this once
1323 * for all the incoming deltas.
1324 */
1325 if ((crbsav == NULL) && (melist->me_dt != DT_ABZERO)) {
1326 /*
1327 * Only use a cached roll buffer if we
1328 * have enough memory, and check for failures.
1329 */
1330 if (((ufs_crb_size + bufsz) < ufs_crb_limit) &&
1331 (kmem_avail() > bufsz)) {
1332 crbsav = kmem_alloc(sizeof (crb_t), KM_NOSLEEP);
1333 } else {
1334 ufs_crb_alloc_fails++;
1335 }
1336 if (crbsav) {
1337 crbsav->c_buf = kmem_alloc(bufsz, KM_NOSLEEP);
1338 if (crbsav->c_buf) {
1339 atomic_add_64(&ufs_crb_size,
1340 (uint64_t)bufsz);
1341 if (ufs_crb_size > ufs_crb_max_size) {
1342 ufs_crb_max_size = ufs_crb_size;
1343 }
1344 bcopy(buf, crbsav->c_buf, bufsz);
1345 crbsav->c_nb = bufsz;
1346 crbsav->c_refcnt = 0;
1347 crbsav->c_invalid = 0;
1348 ASSERT((bufmof & DEV_BMASK) == 0);
1349 crbsav->c_mof = bufmof;
1350 } else {
1351 kmem_free(crbsav, sizeof (crb_t));
1352 crbsav = NULL;
1353 }
1354 }
1355 }
1356
1357 /*
1358 * remove from list
1359 */
1360 me = melist;
1361 melist = melist->me_hash;
1362 me->me_flags &= ~ME_LIST;
1363 me->me_crb = crbsav;
1364 if (crbsav) {
1365 crbsav->c_refcnt++;
1366 }
1367 crbsav = NULL;
1368
1369 ASSERT(va);
1370 ldl_write(ul, va, vamof, me); /* add to on-disk log */
1371 if (ul->un_flags & LDL_ERROR) {
1372 CRB_RELE(me);
1373 kmem_cache_free(mapentry_cache, me);
1374 continue;
1375 }
1376 ASSERT(((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
1377 map_check_ldl_write(ul, va, vamof, me));
1378
1379 /*
1380 * put on hash
1381 */
1382 mutex_enter(&mtm->mtm_mutex);
1383 me->me_hash = *savmep;
1384 *savmep = me;
1385 me->me_next = (mapentry_t *)mtm;
1386 me->me_prev = mtm->mtm_prev;
1387 mtm->mtm_prev->me_next = me;
1388 mtm->mtm_prev = me;
1389 me->me_flags |= ME_HASH;
1390 me->me_tid = tid;
1391 me->me_age = mtm->mtm_age++;
1392 mtm->mtm_nme++;
1393 mtm->mtm_nmet++;
1394 mutex_exit(&mtm->mtm_mutex);
1395 }
1396
1397 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1398 map_check_linkage(mtm));
1399 mutex_exit(&ul->un_log_mutex);
1400 }
1401
1402 /*
1403 * free up any cancelled deltas
1404 */
1405 void
1406 logmap_free_cancel(mt_map_t *mtm, mapentry_t **cancelhead)
1407 {
1408 int dolock = 0;
1409 mapentry_t *me;
1410 mapentry_t **mep;
1411
1412 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1413 map_check_linkage(mtm));
1414
1415 again:
1416 if (dolock)
1417 rw_enter(&mtm->mtm_rwlock, RW_WRITER);
1418
1419 /*
1420 * At EOT, cancel the indicated deltas
1421 */
1422 mutex_enter(&mtm->mtm_mutex);
1423 if (mtm->mtm_flags & MTM_CANCELED) {
1424 mtm->mtm_flags &= ~MTM_CANCELED;
1425 ASSERT(dolock == 0);
1426 mutex_exit(&mtm->mtm_mutex);
1427 return;
1428 }
1429
1430 while ((me = *cancelhead) != NULL) {
1431 /*
1432 * roll forward or read collision; wait and try again
1433 */
1434 if (me->me_flags & ME_AGE) {
1435 ASSERT(dolock == 0);
1436 mutex_exit(&mtm->mtm_mutex);
1437 dolock = 1;
1438 goto again;
1439 }
1440 /*
1441 * remove from cancel list
1442 */
1443 *cancelhead = me->me_cancel;
1444 me->me_cancel = NULL;
1445 me->me_flags &= ~(ME_CANCEL);
1446
1447 /*
1448 * logmap_remove_roll handles ME_ROLL entries later
1449 * we leave them around for logmap_iscancel
1450 * XXX is this necessary?
1451 */
1452 if (me->me_flags & ME_ROLL)
1453 continue;
1454
1455 /*
1456 * remove from hash (if necessary)
1457 */
1458 if (me->me_flags & ME_HASH) {
1459 mep = MAP_HASH(me->me_mof, mtm);
1460 while (*mep) {
1461 if (*mep == me) {
1462 *mep = me->me_hash;
1463 me->me_next->me_prev = me->me_prev;
1464 me->me_prev->me_next = me->me_next;
1465 me->me_flags &= ~(ME_HASH);
1466 if (!(me->me_flags & ME_USER)) {
1467 mtm->mtm_nme--;
1468 }
1469 break;
1470 } else
1471 mep = &(*mep)->me_hash;
1472 }
1473 }
1474 /*
1475 * put the entry on the free list
1476 */
1477 CRB_RELE(me);
1478 kmem_cache_free(mapentry_cache, me);
1479 }
1480 mutex_exit(&mtm->mtm_mutex);
1481 if (dolock)
1482 rw_exit(&mtm->mtm_rwlock);
1483
1484 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1485 map_check_linkage(mtm));
1486 }
1487
1488
1489 void
1490 logmap_commit(ml_unit_t *ul, uint32_t tid)
1491 {
1492 mapentry_t me;
1493 mt_map_t *mtm = ul->un_logmap;
1494
1495
1496 ASSERT(MUTEX_HELD(&ul->un_log_mutex));
1497
1498 /*
1499 * async'ly write a commit rec into the log
1500 */
1501 if (mtm->mtm_dirty) {
1502 /*
1503 * put commit record into log
1504 */
1505 me.me_mof = mtm->mtm_tid;
1506 me.me_dt = DT_COMMIT;
1507 me.me_nb = 0;
1508 me.me_hash = NULL;
1509 logmap_wait_space(mtm, ul, &me);
1510 ldl_write(ul, NULL, (offset_t)0, &me);
1511 ldl_round_commit(ul);
1512
1513 /*
1514 * abort on error; else reset dirty flag
1515 */
1516 if (ul->un_flags & LDL_ERROR)
1517 logmap_abort(ul, tid);
1518 else {
1519 mtm->mtm_dirty = 0;
1520 mtm->mtm_nmet = 0;
1521 mtm->mtm_cfrags = 0;
1522 }
1523 /* push commit */
1524 ldl_push_commit(ul);
1525 }
1526 }
1527
1528 void
1529 logmap_sethead(mt_map_t *mtm, ml_unit_t *ul)
1530 {
1531 off_t lof;
1532 uint32_t tid;
1533 mapentry_t *me;
1534
1535 /*
1536 * move the head forward so the log knows how full it is
1537 * Make sure to skip any mapentry whose me_lof is 0; these
1538 * are just placeholders for DT_CANCELED freed user blocks
1539 * for the current moby.
1540 */
1541 mutex_enter(&ul->un_log_mutex);
1542 mutex_enter(&mtm->mtm_mutex);
1543 me = mtm->mtm_next;
1544 while (me != (mapentry_t *)mtm && me->me_lof == 0) {
1545 me = me->me_next;
1546 }
1547
1548 if (me == (mapentry_t *)mtm)
1549 lof = -1;
1550 else {
1551 lof = me->me_lof;
1552 tid = me->me_tid;
1553 }
1554 mutex_exit(&mtm->mtm_mutex);
1555 ldl_sethead(ul, lof, tid);
1556 if (lof == -1)
1557 mtm->mtm_age = 0;
1558 mutex_exit(&ul->un_log_mutex);
1559 }
1560
1561 void
1562 logmap_settail(mt_map_t *mtm, ml_unit_t *ul)
1563 {
1564 off_t lof;
1565 size_t nb;
1566
1567 /*
1568 * set the tail after the logmap_abort
1569 */
1570 mutex_enter(&ul->un_log_mutex);
1571 mutex_enter(&mtm->mtm_mutex);
1572 if (mtm->mtm_prev == (mapentry_t *)mtm)
1573 lof = -1;
1574 else {
1575 /*
1576 * set the tail to the end of the last commit
1577 */
1578 lof = mtm->mtm_tail_lof;
1579 nb = mtm->mtm_tail_nb;
1580 }
1581 mutex_exit(&mtm->mtm_mutex);
1582 ldl_settail(ul, lof, nb);
1583 mutex_exit(&ul->un_log_mutex);
1584 }
1585
1586 /*
1587 * when resetting a device; roll the log until every
1588 * delta has been rolled forward
1589 */
1590 void
1591 logmap_roll_dev(ml_unit_t *ul)
1592 {
1593 mt_map_t *mtm = ul->un_logmap;
1594 mapentry_t *me;
1595 ufsvfs_t *ufsvfsp = ul->un_ufsvfs;
1596
1597 again:
1598 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1599 map_check_linkage(mtm));
1600 if (ul->un_flags & (LDL_ERROR|LDL_NOROLL))
1601 return;
1602
1603 /*
1604 * look for deltas
1605 */
1606 mutex_enter(&mtm->mtm_mutex);
1607 for (me = mtm->mtm_next; me != (mapentry_t *)mtm; me = me->me_next) {
1608 if (me->me_flags & ME_ROLL)
1609 break;
1610 if (me->me_tid == mtm->mtm_tid)
1611 continue;
1612 if (me->me_tid == mtm->mtm_committid)
1613 continue;
1614 break;
1615 }
1616
1617 /*
1618 * found a delta; kick the roll thread
1619 * but only if the thread is running... (jmh)
1620 */
1621 if (me != (mapentry_t *)mtm) {
1622 mutex_exit(&mtm->mtm_mutex);
1623 logmap_forceroll(mtm);
1624 goto again;
1625 }
1626
1627 /*
1628 * no more deltas, return
1629 */
1630 mutex_exit(&mtm->mtm_mutex);
1631 (void) ufs_putsummaryinfo(ul->un_dev, ufsvfsp, ufsvfsp->vfs_fs);
1632
1633 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1634 map_check_linkage(mtm));
1635 }
1636
1637 static void
1638 logmap_cancel_delta(ml_unit_t *ul, offset_t mof, int32_t nb, int metadata)
1639 {
1640 mapentry_t *me;
1641 mapentry_t **mep;
1642 mt_map_t *mtm = ul->un_logmap;
1643 int frags;
1644
1645 /*
1646 * map has been referenced and is dirty
1647 */
1648 mtm->mtm_ref = 1;
1649 mtm->mtm_dirty++;
1650
1651 /*
1652 * get a mapentry
1653 */
1654 me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
1655 bzero(me, sizeof (mapentry_t));
1656
1657 /*
1658 * initialize cancel record and put in logmap
1659 */
1660 me->me_mof = mof;
1661 me->me_nb = nb;
1662 me->me_dt = DT_CANCEL;
1663 me->me_tid = mtm->mtm_tid;
1664 me->me_hash = NULL;
1665
1666 /*
1667 * Write delta to log if this delta is for metadata. If this is not
1668 * metadata it is user data and we are just putting a cancel
1669 * mapentry into the hash to cancel a user block deletion
1670 * for which we do not want the block to be allocated
1671 * within this moby. This cancel entry will prevent the block from
1672 * being allocated within the moby and prevent user data corruption
1673 * if we happen to crash before this moby is committed.
1674 */
1675 mutex_enter(&ul->un_log_mutex);
1676 if (metadata) {
1677 logmap_wait_space(mtm, ul, me);
1678 ldl_write(ul, NULL, (offset_t)0, me);
1679 if (ul->un_flags & LDL_ERROR) {
1680 kmem_cache_free(mapentry_cache, me);
1681 mutex_exit(&ul->un_log_mutex);
1682 return;
1683 }
1684 }
1685
1686 /*
1687 * put in hash and on cancel list
1688 */
1689 mep = MAP_HASH(mof, mtm);
1690 mutex_enter(&mtm->mtm_mutex);
1691 me->me_age = mtm->mtm_age++;
1692 me->me_hash = *mep;
1693 *mep = me;
1694 me->me_next = (mapentry_t *)mtm;
1695 me->me_prev = mtm->mtm_prev;
1696 mtm->mtm_prev->me_next = me;
1697 mtm->mtm_prev = me;
1698 me->me_cancel = mtm->mtm_cancel;
1699 mtm->mtm_cancel = me;
1700 if (metadata) {
1701 mtm->mtm_nme++;
1702 mtm->mtm_nmet++;
1703 } else {
1704 me->me_flags = ME_USER;
1705 }
1706 me->me_flags |= (ME_HASH|ME_CANCEL);
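/*
 * For user data, count the canceled fragments; logmap_need_commit()
 * compares mtm_cfrags against mtm_cfragmax to force a commit before
 * too many of these placeholder entries accumulate.
 */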
1707 if (!(metadata)) {
1708 frags = blkoff(ul->un_ufsvfs->vfs_fs, nb);
1709 if (frags)
1710 mtm->mtm_cfrags +=
1711 numfrags(ul->un_ufsvfs->vfs_fs, frags);
1712 }
1713 mutex_exit(&mtm->mtm_mutex);
1714
1715 mutex_exit(&ul->un_log_mutex);
1716 }
1717
1718 /*
1719 * cancel entries in a logmap (entries are freed at EOT)
1720 */
1721 void
1722 logmap_cancel(ml_unit_t *ul, offset_t mof, off_t nb, int metadata)
1723 {
1724 int32_t hnb;
1725 mapentry_t *me;
1726 mapentry_t **mep;
1727 mt_map_t *mtm = ul->un_logmap;
1728 crb_t *crb;
1729
1730 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1731 map_check_linkage(mtm));
1732
1733 for (hnb = 0; nb; nb -= hnb, mof += hnb) {
1734 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
1735 if (hnb > nb)
1736 hnb = nb;
1737 /*
1738 * Find overlapping metadata entries. Don't search through
1739 * the hash chains if this is user data because it is only
1740 * possible to have overlapping map entries for metadata,
1741 * and the search can become expensive for large files.
1742 */
1743 if (metadata) {
1744 mep = MAP_HASH(mof, mtm);
1745 mutex_enter(&mtm->mtm_mutex);
1746 for (me = *mep; me; me = me->me_hash) {
1747 if (!DATAoverlapME(mof, hnb, me))
1748 continue;
1749
1750 ASSERT(MEwithinDATA(me, mof, hnb));
1751
1752 if ((me->me_flags & ME_CANCEL) == 0) {
1753 me->me_cancel = mtm->mtm_cancel;
1754 mtm->mtm_cancel = me;
1755 me->me_flags |= ME_CANCEL;
1756 crb = me->me_crb;
1757 if (crb) {
1758 crb->c_invalid = 1;
1759 }
1760 }
1761 }
1762 mutex_exit(&mtm->mtm_mutex);
1763 }
1764
1765 /*
1766 * put a cancel record into the log
1767 */
1768 logmap_cancel_delta(ul, mof, hnb, metadata);
1769 }
1770
1771 ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
1772 map_check_linkage(mtm));
1773 }
1774
1775 /*
1776 * check for overlap w/cancel delta
1777 */
1778 int
1779 logmap_iscancel(mt_map_t *mtm, offset_t mof, off_t nb)
1780 {
1781 off_t hnb;
1782 mapentry_t *me;
1783 mapentry_t **mep;
1784
1785 mutex_enter(&mtm->mtm_mutex);
1786 for (hnb = 0; nb; nb -= hnb, mof += hnb) {
1787 hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
1788 if (hnb > nb)
1789 hnb = nb;
1790 /*
1791 * search for dup entry
1792 */
1793 mep = MAP_HASH(mof, mtm);
1794 for (me = *mep; me; me = me->me_hash) {
1795 if (((me->me_flags & ME_ROLL) == 0) &&
1796 (me->me_dt != DT_CANCEL))
1797 continue;
1798 if (DATAoverlapME(mof, hnb, me))
1799 break;
1800 }
1801
1802 /*
1803 * overlap detected
1804 */
1805 if (me) {
1806 mutex_exit(&mtm->mtm_mutex);
1807 return (1);
1808 }
1809 }
1810 mutex_exit(&mtm->mtm_mutex);
1811 return (0);
1812 }
1813
1814 static int
1815 logmap_logscan_add(ml_unit_t *ul, struct delta *dp, off_t lof, size_t *nbp)
1816 {
1817 mapentry_t *me;
1818 int error;
1819 mt_map_t *mtm = ul->un_logmap;
1820
1821 /*
1822 * verify delta header; failure == mediafail
1823 */
1824 error = 0;
1825 /* delta type */
1826 if ((dp->d_typ <= DT_NONE) || (dp->d_typ >= DT_MAX))
1827 error = EINVAL;
1828 if (dp->d_typ == DT_COMMIT) {
1829 if (dp->d_nb != INT32_C(0) && dp->d_nb != INT32_C(-1))
1830 error = EINVAL;
1831 } else {
1832 /* length of delta */
1833 if ((dp->d_nb < INT32_C(0)) ||
1834 (dp->d_nb > INT32_C(MAPBLOCKSIZE)))
1835 error = EINVAL;
1836
1837 /* offset on master device */
1838 if (dp->d_mof < INT64_C(0))
1839 error = EINVAL;
1840 }
1841
1842 if (error) {
1843 ldl_seterror(ul, "Error processing ufs log data during scan");
1844 return (error);
1845 }
1846
1847 /*
1848 * process commit record
1849 */
1850 if (dp->d_typ == DT_COMMIT) {
1851 if (mtm->mtm_dirty) {
1852 ASSERT(dp->d_nb == INT32_C(0));
1853 logmap_free_cancel(mtm, &mtm->mtm_cancel);
1854 mtm->mtm_dirty = 0;
1855 mtm->mtm_nmet = 0;
1856 mtm->mtm_tid++;
1857 mtm->mtm_committid = mtm->mtm_tid;
1858 ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
1859 logmap_logscan_commit_debug(lof, mtm));
1860 }
1861 /*
1862 * return #bytes to next sector (next delta header)
1863 */
1864 *nbp = ldl_logscan_nbcommit(lof);
1865 mtm->mtm_tail_lof = lof;
1866 mtm->mtm_tail_nb = *nbp;
1867 return (0);
1868 }
1869
1870 /*
1871 * add delta to logmap
1872 */
1873 me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
1874 bzero(me, sizeof (mapentry_t));
1875 me->me_lof = lof;
1876 me->me_mof = dp->d_mof;
1877 me->me_nb = dp->d_nb;
1878 me->me_tid = mtm->mtm_tid;
1879 me->me_dt = dp->d_typ;
1880 me->me_hash = NULL;
1881 me->me_flags = (ME_LIST | ME_SCAN);
1882 logmap_add(ul, NULL, 0, me);
1883 switch (dp->d_typ) {
1884 case DT_CANCEL:
1885 me->me_flags |= ME_CANCEL;
1886 me->me_cancel = mtm->mtm_cancel;
1887 mtm->mtm_cancel = me;
1888 break;
1889 default:
1890 ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
1891 logmap_logscan_add_debug(dp, mtm));
1892 break;
1893 }
1894
1895 sizeofdelta:
1896 /*
1897 * return #bytes till next delta header
1898 */
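/*
 * DT_CANCEL and DT_ABZERO deltas carry no data payload in the log,
 * so the next delta header follows immediately.
 */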
1899 if ((dp->d_typ == DT_CANCEL) || (dp->d_typ == DT_ABZERO))
1900 *nbp = 0;
1901 else
1902 *nbp = dp->d_nb;
1903 return (0);
1904 }
1905
1906 void
1907 logmap_logscan(ml_unit_t *ul)
1908 {
1909 size_t nb, nbd;
1910 off_t lof;
1911 struct delta delta;
1912 mt_map_t *logmap = ul->un_logmap;
1913
1914 ASSERT(ul->un_deltamap->mtm_next == (mapentry_t *)ul->un_deltamap);
1915
1916 /*
1917 * prepare the log for a logscan
1918 */
1919 ldl_logscan_begin(ul);
1920
1921 /*
1922 * prepare the logmap for a logscan
1923 */
1924 (void) map_free_entries(logmap);
1925 logmap->mtm_tid = 0;
1926 logmap->mtm_committid = UINT32_C(0);
1927 logmap->mtm_age = 0;
1928 logmap->mtm_dirty = 0;
1929 logmap->mtm_ref = 0;
1930
1931 /*
1932 * while not at end of log
1933 * read delta header
1934 * add to logmap
1935 * seek to beginning of next delta
1936 */
1937 lof = ul->un_head_lof;
1938 nbd = sizeof (delta);
1939 while (lof != ul->un_tail_lof) {
1940
1941 /* read delta header */
1942 if (ldl_logscan_read(ul, &lof, nbd, (caddr_t)&delta))
1943 break;
1944
1945 /* add to logmap */
1946 if (logmap_logscan_add(ul, &delta, lof, &nb))
1947 break;
1948
1949 /* seek to next header (skip data) */
1950 if (ldl_logscan_read(ul, &lof, nb, NULL))
1951 break;
1952 }
1953
1954 /*
1955 * remove the last partial transaction from the logmap
1956 */
1957 logmap_abort(ul, logmap->mtm_tid);
1958
1959 ldl_logscan_end(ul);
1960 }
1961
1962 void
1963 _init_map(void)
1964 {
1965 /*
1966 * Initialise the mapentry cache. No constructor or destructor
1967 * is needed. Also no reclaim function is supplied as reclaiming
1968 * current entries is not possible.
1969 */
1970 mapentry_cache = kmem_cache_create("lufs_mapentry_cache",
1971 sizeof (mapentry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
1972 }
1973
1974 /*
1975 * Special case when we replace an old map entry which carries quota
1976 * information with a newer entry which does not.
1977 * In that case the push function would not be called to clean up the
1978 * dquot structure. This would be found later by invalidatedq() causing
1979 * a panic when the filesystem in unmounted.
1980 * We clean up the dquot manually before replacing the map entry.
1981 */
1982 void
1983 handle_dquot(mapentry_t *me)
1984 {
1985 int dolock = 0;
1986 int domutex = 0;
1987 struct dquot *dqp;
1988
1989 dqp = (struct dquot *)me->me_arg;
1990
1991 /*
1992 * We need vfs_dqrwlock to call dqput()
1993 */
1994 dolock = (!RW_LOCK_HELD(&dqp->dq_ufsvfsp->vfs_dqrwlock));
1995 if (dolock)
1996 rw_enter(&dqp->dq_ufsvfsp->vfs_dqrwlock, RW_READER);
1997
1998 domutex = (!MUTEX_HELD(&dqp->dq_lock));
1999 if (domutex)
2000 mutex_enter(&dqp->dq_lock);
2001
2002 /*
2003 * Only clean up if the dquot is referenced
2004 */
2005 if (dqp->dq_cnt == 0) {
2006 if (domutex)
2007 mutex_exit(&dqp->dq_lock);
2008 if (dolock)
2009 rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
2010 return;
2011 }
2012
2013 dqp->dq_flags &= ~(DQ_MOD|DQ_TRANS);
2014 dqput(dqp);
2015
2016 if (domutex)
2017 mutex_exit(&dqp->dq_lock);
2018
2019 if (dolock)
2020 rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
2021
2022 }
2023