/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Support for ephemeral mounts, e.g. mirror-mounts. These mounts are
 * triggered from a "stub" rnode via a special set of vnodeops.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/list.h>
#include <sys/stat.h>
#include <sys/mntent.h>
#include <sys/priv.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>
#include <nfs/nfs4.h>
#include <nfs/nfs4_kprot.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>
#include <nfs/nfsid_map.h>
#include <nfs/nfs4_idmap_impl.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>
#include <sys/int_fmtio.h>

#include <sys/sunddi.h>

#include <sys/priv_names.h>

extern zone_key_t nfs4clnt_zone_key;
extern zone_key_t nfsidmap_zone_key;

/*
 * The automatic unmounter (harvester) thread.
 */
static int nfs4_trigger_thread_timer = 20;	/* in seconds */

/*
 * Just a default; the value actually used is the per-zone copy kept
 * in nfs4_trigger_globals_t (ntg_mount_to).
 */
static uint_t nfs4_trigger_mount_to = 240;

typedef struct nfs4_trigger_globals {
	kmutex_t		ntg_forest_lock;
	uint_t			ntg_mount_to;
	int			ntg_thread_started;
	nfs4_ephemeral_tree_t	*ntg_forest;
} nfs4_trigger_globals_t;

kmutex_t nfs4_ephemeral_thread_lock;

zone_key_t nfs4_ephemeral_key = ZONE_KEY_UNINITIALIZED;

static void nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *);

/*
 * Used for ephemeral mounts; contains data either duplicated from
 * servinfo4_t, or hand-crafted, depending on type of ephemeral mount.
 *
 * It's intended that this structure is used solely for ephemeral
 * mount-type specific data, for passing this data to
 * nfs4_trigger_nargs_create().
 */
typedef struct ephemeral_servinfo {
	char			*esi_hostname;
	char			*esi_netname;
	char			*esi_path;
	int			esi_path_len;
	int			esi_mount_flags;
	struct netbuf		*esi_addr;
	struct netbuf		*esi_syncaddr;
	struct knetconfig	*esi_knconf;
} ephemeral_servinfo_t;

/*
 * Collect together the mount-type specific and generic data args.
 */
typedef struct domount_args {
	ephemeral_servinfo_t	*dma_esi;
	char			*dma_hostlist; /* comma-sep. for RO failover */
	struct nfs_args		*dma_nargs;
} domount_args_t;

/*
 * The vnode ops functions for a trigger stub vnode
 */
static int nfs4_trigger_open(vnode_t **, int, cred_t *, caller_context_t *);
static int nfs4_trigger_getattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_setattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_access(vnode_t *, int, int, cred_t *,
    caller_context_t *);
static int nfs4_trigger_readlink(vnode_t *, struct uio *, cred_t *,
    caller_context_t *);
static int nfs4_trigger_lookup(vnode_t *, char *, vnode_t **,
    struct pathname *, int, vnode_t *, cred_t *, caller_context_t *,
    int *, pathname_t *);
static int nfs4_trigger_create(vnode_t *, char *, struct vattr *,
    enum vcexcl, int, vnode_t **, cred_t *, int, caller_context_t *,
    vsecattr_t *);
static int nfs4_trigger_remove(vnode_t *, char *, cred_t *, caller_context_t *,
    int);
static int nfs4_trigger_link(vnode_t *, vnode_t *, char *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_rename(vnode_t *, char *, vnode_t *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_mkdir(vnode_t *, char *, struct vattr *,
    vnode_t **, cred_t *, caller_context_t *, int, vsecattr_t *vsecp);
static int nfs4_trigger_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
    caller_context_t *, int);
static int nfs4_trigger_symlink(vnode_t *, char *, struct vattr *, char *,
    cred_t *, caller_context_t *, int);
static int nfs4_trigger_cmp(vnode_t *, vnode_t *, caller_context_t *);

/*
 * Regular NFSv4 vnodeops that we need to reference directly
 */
extern int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
    caller_context_t *);
extern void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
extern int nfs4_rwlock(vnode_t *, int, caller_context_t *);
extern void nfs4_rwunlock(vnode_t *, int, caller_context_t *);
extern int nfs4_lookup(vnode_t *, char *, vnode_t **,
    struct pathname *, int, vnode_t *, cred_t *,
    caller_context_t *, int *, pathname_t *);
extern int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
    caller_context_t *);
extern int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
    caller_context_t *);
extern int nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
extern int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);

static int nfs4_trigger_mount(vnode_t *, cred_t *, vnode_t **);
static int nfs4_trigger_domount(vnode_t *, domount_args_t *, vfs_t **,
    cred_t *, vnode_t **);
static domount_args_t *nfs4_trigger_domount_args_create(vnode_t *, cred_t *);
static void nfs4_trigger_domount_args_destroy(domount_args_t *dma,
    vnode_t *vp);
static ephemeral_servinfo_t *nfs4_trigger_esi_create(vnode_t *, servinfo4_t *,
    cred_t *);
static void nfs4_trigger_esi_destroy(ephemeral_servinfo_t *, vnode_t *);
static ephemeral_servinfo_t *nfs4_trigger_esi_create_mirrormount(vnode_t *,
    servinfo4_t *);
static ephemeral_servinfo_t *nfs4_trigger_esi_create_referral(vnode_t *,
    cred_t *);
static struct nfs_args *nfs4_trigger_nargs_create(mntinfo4_t *, servinfo4_t *,
    ephemeral_servinfo_t *);
static void nfs4_trigger_nargs_destroy(struct nfs_args *);
static char *nfs4_trigger_create_mntopts(vfs_t *);
static void nfs4_trigger_destroy_mntopts(char *);
static int nfs4_trigger_add_mntopt(char *, char *, vfs_t *);
static enum clnt_stat nfs4_trigger_ping_server(servinfo4_t *, int);
static enum clnt_stat nfs4_ping_server_common(struct knetconfig *,
    struct netbuf *, int);

extern int umount2_engine(vfs_t *, int, cred_t *, int);

vnodeops_t *nfs4_trigger_vnodeops;

/*
 * These are the vnodeops that we must define for stub vnodes.
 *
 *
 * Many of the VOPs defined for NFSv4 do not need to be defined here,
 * for various reasons. This will result in the VFS default function being
 * used:
 *
 * - These VOPs require a previous VOP_OPEN to have occurred. That will have
 * lost the reference to the stub vnode, meaning these should not be called:
 * close, read, write, ioctl, readdir, seek.
 *
 * - These VOPs are meaningless for vnodes without data pages. Since the
 * stub vnode is of type VDIR, these should not be called:
 * space, getpage, putpage, map, addmap, delmap, pageio, fsync.
 *
 * - These VOPs are otherwise not applicable, and should not be called:
 * dump, setsecattr.
 *
 *
 * These VOPs we do not want to define, but neither do we want the VFS
 * default action. Instead, we specify the VFS error function, with
 * fs_error(), but note that fs_error() is not actually called. Instead it
 * results in the use of the error function defined for the particular VOP,
 * in vn_ops_table[]:
 *
 * - frlock, dispose, shrlock.
 *
 *
 * These VOPs we define to use the corresponding regular NFSv4 vnodeop.
 * NOTE: if any of these ops involve an OTW call with the stub FH, then
 * that call must be wrapped with save_mnt_secinfo()/check_mnt_secinfo()
 * to protect the security data in the servinfo4_t for the "parent"
 * filesystem that contains the stub.
 *
 * - These VOPs should not trigger a mount, so that "ls -l" does not:
 * pathconf, getsecattr.
 *
 * - These VOPs would not make sense to trigger:
 * inactive, rwlock, rwunlock, fid, realvp.
 */
const fs_operation_def_t nfs4_trigger_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_trigger_open },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_trigger_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_trigger_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_trigger_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_trigger_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_trigger_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_trigger_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_trigger_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_trigger_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_trigger_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_trigger_rmdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_trigger_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_trigger_readlink },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_FRLOCK,		{ .error = fs_error },
	VOPNAME_DISPOSE,	{ .error = fs_error },
	VOPNAME_SHRLOCK,	{ .error = fs_error },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
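
/*
 * A sketch (the call itself lives elsewhere, typically in client module
 * initialization, not in this file) of how a template like the above is
 * usually turned into the live nfs4_trigger_vnodeops used for stub vnodes:
 *
 *	(void) vn_make_ops("nfs4_trigger", nfs4_trigger_vnodeops_template,
 *	    &nfs4_trigger_vnodeops);
 */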

static void
nfs4_ephemeral_tree_incr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	net->net_refcnt++;
	ASSERT(net->net_refcnt != 0);
}

static void
nfs4_ephemeral_tree_hold(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	nfs4_ephemeral_tree_incr(net);
	mutex_exit(&net->net_cnt_lock);
}

/*
 * We need a safe way to decrement the refcnt whilst the
 * lock is being held.
 */
static void
nfs4_ephemeral_tree_decr(nfs4_ephemeral_tree_t *net)
{
	ASSERT(mutex_owned(&net->net_cnt_lock));
	ASSERT(net->net_refcnt != 0);
	net->net_refcnt--;
}

static void
nfs4_ephemeral_tree_rele(nfs4_ephemeral_tree_t *net)
{
	mutex_enter(&net->net_cnt_lock);
	nfs4_ephemeral_tree_decr(net);
	mutex_exit(&net->net_cnt_lock);
}

/*
 * Trigger ops for stub vnodes; for mirror mounts, etc.
 *
 * The general idea is that a "triggering" op will first call
 * nfs4_trigger_mount(), which will find out whether a mount has already
 * been triggered.
 *
 * If it has, then nfs4_trigger_mount() sets newvp to the root vnode
 * of the covering vfs.
 *
 * If a mount has not yet been triggered, nfs4_trigger_mount() will do so,
 * and again set newvp, as above.
 *
 * The triggering op may then re-issue the VOP by calling it on newvp.
 *
 * Note that some ops may perform custom action, and may or may not need
 * to trigger a mount.
 *
 * Some ops need to call the regular NFSv4 vnodeop for a stub vnode. We
 * obviously can't do this with VOP_<whatever>, since it's a stub vnode
 * and that would just recurse. Instead, we call the v4 op directly,
 * by name. This is OK, since we know that the vnode is for NFSv4,
 * otherwise it couldn't be a stub.
 *
 */
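
/*
 * The common shape of a triggering op, in sketch form (each of the
 * functions below follows it, modulo op-specific details):
 *
 *	error = nfs4_trigger_mount(vp, cr, &newvp);
 *	if (error)
 *		return (error);
 *
 *	error = VOP_<whatever>(newvp, ...);
 *	VN_RELE(newvp);
 *	return (error);
 */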

static int
nfs4_trigger_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(*vpp, cr, &newvp);
	if (error)
		return (error);

	/* Release the stub vnode, as we're losing the reference to it */
	VN_RELE(*vpp);

	/* Give the caller the root vnode of the newly-mounted fs */
	*vpp = newvp;

	/* return with VN_HELD(newvp) */
	return (VOP_OPEN(vpp, flag, cr, ct));
}

void
nfs4_fake_attrs(vnode_t *vp, struct vattr *vap)
{
	uint_t mask;
	timespec_t now;

	/*
	 * Set some attributes here for referrals.
	 */
	mask = vap->va_mask;
	bzero(vap, sizeof (struct vattr));
	vap->va_mask = mask;
	vap->va_uid = 0;
	vap->va_gid = 0;
	vap->va_nlink = 1;
	vap->va_size = 1;
	gethrestime(&now);
	vap->va_atime = now;
	vap->va_mtime = now;
	vap->va_ctime = now;
	vap->va_type = VDIR;
	vap->va_mode = 0555;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_rdev = 0;
	vap->va_blksize = MAXBSIZE;
	vap->va_nblocks = 1;
	vap->va_seq = 0;
}

/*
 * For the majority of cases, nfs4_trigger_getattr() will not trigger
 * a mount. However, if ATTR_TRIGGER is set, we are being informed
 * that we need to force the mount before we attempt to determine
 * the attributes. The intent is an atomic operation for security
 * testing.
 *
 * If we're not triggering a mount, we can still inquire about the
 * actual attributes from the server in the mirror mount case,
 * and will return manufactured attributes for a referral (see
 * the 'create' branch of find_referral_stubvp()).
 */
static int
nfs4_trigger_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;

	if (flags & ATTR_TRIGGER) {
		vnode_t *newvp;

		error = nfs4_trigger_mount(vp, cr, &newvp);
		if (error)
			return (error);

		error = VOP_GETATTR(newvp, vap, flags, cr, ct);
		VN_RELE(newvp);

	} else if (RP_ISSTUB_MIRRORMOUNT(VTOR4(vp))) {

		error = nfs4_getattr(vp, vap, flags, cr, ct);

	} else if (RP_ISSTUB_REFERRAL(VTOR4(vp))) {

		nfs4_fake_attrs(vp, vap);
		error = 0;
	}

	return (error);
}

static int
nfs4_trigger_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, cr, &newvp);
	if (error)
		return (error);

	error = VOP_SETATTR(newvp, vap, flags, cr, ct);
	VN_RELE(newvp);

	return (error);
}

static int
nfs4_trigger_access(vnode_t *vp, int mode, int flags, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, cr, &newvp);
	if (error)
		return (error);

	error = VOP_ACCESS(newvp, mode, flags, cr, ct);
	VN_RELE(newvp);

	return (error);
}

static int
nfs4_trigger_lookup(vnode_t *dvp, char *nm, vnode_t **vpp,
    struct pathname *pnp, int flags, vnode_t *rdir, cred_t *cr,
    caller_context_t *ct, int *deflags, pathname_t *rpnp)
{
	int error;
	vnode_t *newdvp;
	rnode4_t *drp = VTOR4(dvp);

	ASSERT(RP_ISSTUB(drp));

	/*
	 * It's not legal to lookup ".." for an fs root, so we mustn't pass
	 * that up. Instead, pass onto the regular op, regardless of whether
	 * we've triggered a mount.
	 */
	if (strcmp(nm, "..") == 0)
		if (RP_ISSTUB_MIRRORMOUNT(drp)) {
			return (nfs4_lookup(dvp, nm, vpp, pnp, flags, rdir, cr,
			    ct, deflags, rpnp));
		} else if (RP_ISSTUB_REFERRAL(drp)) {
			/* Return the parent vnode */
			return (vtodv(dvp, vpp, cr, TRUE));
		}

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_LOOKUP(newdvp, nm, vpp, pnp, flags, rdir, cr, ct,
	    deflags, rpnp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_create(vnode_t *dvp, char *nm, struct vattr *va,
    enum vcexcl exclusive, int mode, vnode_t **vpp, cred_t *cr,
    int flags, caller_context_t *ct, vsecattr_t *vsecp)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_CREATE(newdvp, nm, va, exclusive, mode, vpp, cr,
	    flags, ct, vsecp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
    int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_REMOVE(newdvp, nm, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newtdvp;

	error = nfs4_trigger_mount(tdvp, cr, &newtdvp);
	if (error)
		return (error);

	/*
	 * We don't check whether svp is a stub. Let the NFSv4 code
	 * detect that error, and return accordingly.
	 */
	error = VOP_LINK(newtdvp, svp, tnm, cr, ct, flags);
	VN_RELE(newtdvp);

	return (error);
}

static int
nfs4_trigger_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newsdvp;
	rnode4_t *tdrp = VTOR4(tdvp);

	/*
	 * We know that sdvp is a stub, otherwise we would not be here.
	 *
	 * If tdvp is also a stub, there are two possibilities: it
	 * is either the same stub as sdvp [i.e. VN_CMP(sdvp, tdvp)]
	 * or it is a different stub [!VN_CMP(sdvp, tdvp)].
	 *
	 * In the former case, just trigger sdvp, and treat tdvp as
	 * though it were not a stub.
	 *
	 * In the latter case, it might be a different stub for the
	 * same server fs as sdvp, or for a different server fs.
	 * Regardless, from the client perspective this would still
	 * be a cross-filesystem rename, and should not be allowed,
	 * so return EXDEV, without triggering either mount.
	 */
	if (RP_ISSTUB(tdrp) && !VN_CMP(sdvp, tdvp))
		return (EXDEV);

	error = nfs4_trigger_mount(sdvp, cr, &newsdvp);
	if (error)
		return (error);

	error = VOP_RENAME(newsdvp, snm, tdvp, tnm, cr, ct, flags);

	VN_RELE(newsdvp);

	return (error);
}

/* ARGSUSED */
static int
nfs4_trigger_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp,
    cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_MKDIR(newdvp, nm, va, vpp, cr, ct, flags, vsecp);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_RMDIR(newdvp, nm, cdir, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm,
    cred_t *cr, caller_context_t *ct, int flags)
{
	int error;
	vnode_t *newdvp;

	error = nfs4_trigger_mount(dvp, cr, &newdvp);
	if (error)
		return (error);

	error = VOP_SYMLINK(newdvp, lnm, tva, tnm, cr, ct, flags);
	VN_RELE(newdvp);

	return (error);
}

static int
nfs4_trigger_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	vnode_t *newvp;

	error = nfs4_trigger_mount(vp, cr, &newvp);
	if (error)
		return (error);

	error = VOP_READLINK(newvp, uiop, cr, ct);
	VN_RELE(newvp);

	return (error);
}

/* end of trigger vnode ops */

/*
 * See if the mount has already been done by another caller.
 */
static int
nfs4_trigger_mounted_already(vnode_t *vp, vnode_t **newvpp,
    bool_t *was_mounted, vfs_t **vfsp)
{
	int error;
	mntinfo4_t *mi = VTOMI4(vp);

	*was_mounted = FALSE;

	error = vn_vfsrlock_wait(vp);
	if (error)
		return (error);

	*vfsp = vn_mountedvfs(vp);
	if (*vfsp != NULL) {
		/* the mount has already occurred */
		error = VFS_ROOT(*vfsp, newvpp);
		if (!error) {
			/* need to update the reference time */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_ephemeral)
				mi->mi_ephemeral->ne_ref_time =
				    gethrestime_sec();
			mutex_exit(&mi->mi_lock);

			*was_mounted = TRUE;
		}
	}

	vn_vfsunlock(vp);
	return (0);
}
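
/*
 * Note for callers of the above: a return of 0 does not imply
 * *was_mounted. If no vfs yet covers vp (or if VFS_ROOT() on the
 * covering vfs failed) we still return 0 with *was_mounted == FALSE,
 * and the caller goes on to attempt the mount itself.
 */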

/*
 * Mount upon a trigger vnode; for mirror-mounts, referrals, etc.
 *
 * The mount may have already occurred, via another thread. If not,
 * assemble the location information - which may require fetching - and
 * perform the mount.
 *
 * Sets newvp to be the root of the fs that is now covering vp. Note
 * that we return with VN_HELD(*newvp).
 *
 * The caller is responsible for passing the VOP onto the covering fs.
 */
static int
nfs4_trigger_mount(vnode_t *vp, cred_t *cr, vnode_t **newvpp)
{
	int error;
	vfs_t *vfsp;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	domount_args_t *dma;

	nfs4_ephemeral_tree_t *net;

	bool_t must_unlock = FALSE;
	bool_t is_building = FALSE;
	bool_t was_mounted = FALSE;

	cred_t *mcred = NULL;

	nfs4_trigger_globals_t *ntg;

	zone_t *zone = curproc->p_zone;

	ASSERT(RP_ISSTUB(rp));

	*newvpp = NULL;

	/*
	 * Has the mount already occurred?
	 */
	error = nfs4_trigger_mounted_already(vp, newvpp,
	    &was_mounted, &vfsp);
	if (error || was_mounted)
		goto done;

	ntg = zone_getspecific(nfs4_ephemeral_key, zone);
	ASSERT(ntg != NULL);

	mutex_enter(&mi->mi_lock);

	/*
	 * We need to lock down the ephemeral tree.
	 */
	if (mi->mi_ephemeral_tree == NULL) {
		net = kmem_zalloc(sizeof (*net), KM_SLEEP);
		mutex_init(&net->net_tree_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&net->net_cnt_lock, NULL, MUTEX_DEFAULT, NULL);
		net->net_refcnt = 1;
		net->net_status = NFS4_EPHEMERAL_TREE_BUILDING;
		is_building = TRUE;

		/*
		 * We need to add it to the zone specific list for
		 * automatic unmounting and harvesting of deadwood.
		 */
		mutex_enter(&ntg->ntg_forest_lock);
		if (ntg->ntg_forest != NULL)
			net->net_next = ntg->ntg_forest;
		ntg->ntg_forest = net;
		mutex_exit(&ntg->ntg_forest_lock);

		/*
		 * No lock order confusion with mi_lock because no
		 * other node could have grabbed net_tree_lock.
		 */
		mutex_enter(&net->net_tree_lock);
		mi->mi_ephemeral_tree = net;
		net->net_mount = mi;
		mutex_exit(&mi->mi_lock);

		MI4_HOLD(mi);
		VFS_HOLD(mi->mi_vfsp);
	} else {
		net = mi->mi_ephemeral_tree;
		nfs4_ephemeral_tree_hold(net);

		mutex_exit(&mi->mi_lock);

		mutex_enter(&net->net_tree_lock);

		/*
		 * We can only proceed if the tree is neither locked
		 * nor being torn down.
		 */
		mutex_enter(&net->net_cnt_lock);
		if (net->net_status & NFS4_EPHEMERAL_TREE_PROCESSING) {
			nfs4_ephemeral_tree_decr(net);
			mutex_exit(&net->net_cnt_lock);
			mutex_exit(&net->net_tree_lock);

			return (EIO);
		}
		mutex_exit(&net->net_cnt_lock);
	}

	mutex_enter(&net->net_cnt_lock);
	net->net_status |= NFS4_EPHEMERAL_TREE_MOUNTING;
	mutex_exit(&net->net_cnt_lock);

	must_unlock = TRUE;

	dma = nfs4_trigger_domount_args_create(vp, cr);
	if (dma == NULL) {
		error = EINVAL;
		goto done;
	}

	/*
	 * Note that since we define mirror mounts to work
	 * for any user, we simply extend the privileges of
	 * the user's credentials to allow the mount to
	 * proceed.
	 */
	mcred = crdup(cr);
	if (mcred == NULL) {
		error = EINVAL;
		goto done;
	}

	crset_zone_privall(mcred);
	if (is_system_labeled())
		(void) setpflags(NET_MAC_AWARE, 1, mcred);

	error = nfs4_trigger_domount(vp, dma, &vfsp, mcred, newvpp);
	nfs4_trigger_domount_args_destroy(dma, vp);

	DTRACE_PROBE2(nfs4clnt__func__referral__mount,
	    vnode_t *, vp, int, error);

	crfree(mcred);

done:

	if (must_unlock) {
		mutex_enter(&net->net_cnt_lock);
		net->net_status &= ~NFS4_EPHEMERAL_TREE_MOUNTING;

		/*
		 * REFCNT: If we are the root of the tree, then we need
		 * to keep a reference because we malloced the tree and
		 * this is where we tied it to our mntinfo.
		 *
		 * If we are not the root of the tree, then our tie to
		 * the mntinfo occurred elsewhere and we need to
		 * decrement the reference to the tree.
		 */
		if (is_building)
			net->net_status &= ~NFS4_EPHEMERAL_TREE_BUILDING;
		else
			nfs4_ephemeral_tree_decr(net);
		mutex_exit(&net->net_cnt_lock);

		mutex_exit(&net->net_tree_lock);
	}

	if (!error && (newvpp == NULL || *newvpp == NULL))
		error = ENOSYS;

	return (error);
}

/*
 * Collect together both the generic & mount-type specific args.
 */
static domount_args_t *
nfs4_trigger_domount_args_create(vnode_t *vp, cred_t *cr)
{
	int nointr;
	char *hostlist;
	servinfo4_t *svp;
	struct nfs_args *nargs, *nargs_head;
	enum clnt_stat status;
	ephemeral_servinfo_t *esi, *esi_first;
	domount_args_t *dma;
	mntinfo4_t *mi = VTOMI4(vp);

	nointr = !(mi->mi_flags & MI4_INT);
	hostlist = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

	svp = mi->mi_curr_serv;
	/* check if the current server is responding */
	status = nfs4_trigger_ping_server(svp, nointr);
	if (status == RPC_SUCCESS) {
		esi_first = nfs4_trigger_esi_create(vp, svp, cr);
		if (esi_first == NULL) {
			kmem_free(hostlist, MAXPATHLEN);
			return (NULL);
		}

		(void) strlcpy(hostlist, esi_first->esi_hostname, MAXPATHLEN);

		nargs_head = nfs4_trigger_nargs_create(mi, svp, esi_first);
	} else {
		/* current server did not respond */
		esi_first = NULL;
		nargs_head = NULL;
	}
	nargs = nargs_head;

	/*
	 * NFS RO failover.
	 *
	 * If we have multiple servinfo4 structures, linked via sv_next,
	 * we must create one nfs_args for each, linking the nfs_args via
	 * nfs_ext_u.nfs_extB.next.
	 *
	 * We need to build a corresponding esi for each, too, but that is
	 * used solely for building nfs_args, and may be immediately
	 * discarded, as domount() requires the info from just one esi,
	 * but all the nfs_args.
	 *
	 * Currently, the NFS mount code will hang if not all servers
	 * requested are available. To avoid that, we need to ping each
	 * server, here, and remove it from the list if it is not
	 * responding. This has the side-effect of that server then
	 * being permanently unavailable for this failover mount, even if
	 * it recovers. That's unfortunate, but the best we can do until
	 * the mount code path is fixed.
	 */

	/*
	 * If the current server was down, loop indefinitely until we find
	 * at least one responsive server.
	 */
	do {
		/* no locking needed for sv_next; it is only set at fs mount */
		for (svp = mi->mi_servers; svp != NULL; svp = svp->sv_next) {
			struct nfs_args *next;

			/*
			 * nargs_head: the head of the nfs_args list
			 * nargs: the current tail of the list
			 * next: the newly-created element to be added
			 */

			/*
			 * We've already tried the current server, above;
			 * if it was responding, we have already included it
			 * and it may now be ignored.
			 *
			 * Otherwise, try it again, since it may now have
			 * recovered.
			 */
			if (svp == mi->mi_curr_serv && esi_first != NULL)
				continue;

			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			if (svp->sv_flags & SV4_NOTINUSE) {
				nfs_rw_exit(&svp->sv_lock);
				continue;
			}
			nfs_rw_exit(&svp->sv_lock);

			/* check if the server is responding */
			status = nfs4_trigger_ping_server(svp, nointr);
			/* if the server did not respond, ignore it */
			if (status != RPC_SUCCESS)
				continue;

			esi = nfs4_trigger_esi_create(vp, svp, cr);
			if (esi == NULL)
				continue;

			/*
			 * If the original current server (mi_curr_serv)
			 * was down when we first tried it,
			 * (i.e. esi_first == NULL),
			 * we select this new server (svp) to be the server
			 * that we will actually contact (esi_first).
			 *
			 * Note that it's possible that mi_curr_serv == svp,
			 * if that mi_curr_serv was down but has now recovered.
			 */
			next = nfs4_trigger_nargs_create(mi, svp, esi);
			if (esi_first == NULL) {
				ASSERT(nargs == NULL);
				ASSERT(nargs_head == NULL);
				nargs_head = next;
				esi_first = esi;
				(void) strlcpy(hostlist,
				    esi_first->esi_hostname, MAXPATHLEN);
			} else {
				ASSERT(nargs_head != NULL);
				nargs->nfs_ext_u.nfs_extB.next = next;
				(void) strlcat(hostlist, ",", MAXPATHLEN);
				(void) strlcat(hostlist, esi->esi_hostname,
				    MAXPATHLEN);
				/* esi was only needed for hostname & nargs */
				nfs4_trigger_esi_destroy(esi, vp);
			}

			nargs = next;
		}

		/* if we've had no response at all, wait a second */
		if (esi_first == NULL)
			delay(drv_usectohz(1000000));

	} while (esi_first == NULL);
	ASSERT(nargs_head != NULL);

	dma = kmem_zalloc(sizeof (domount_args_t), KM_SLEEP);
	dma->dma_esi = esi_first;
	dma->dma_hostlist = hostlist;
	dma->dma_nargs = nargs_head;

	return (dma);
}
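
/*
 * On success (illustration of what the above builds): dma_nargs points
 * at a chain of one nfs_args per responsive server, linked through
 * nfs_ext_u.nfs_extB.next and terminated by NULL, while dma_hostlist
 * holds the matching comma-separated "host1,host2,..." list used for
 * RO failover, and dma_esi describes the one server actually contacted.
 */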

static void
nfs4_trigger_domount_args_destroy(domount_args_t *dma, vnode_t *vp)
{
	if (dma != NULL) {
		if (dma->dma_esi != NULL && vp != NULL)
			nfs4_trigger_esi_destroy(dma->dma_esi, vp);

		if (dma->dma_hostlist != NULL)
			kmem_free(dma->dma_hostlist, MAXPATHLEN);

		if (dma->dma_nargs != NULL) {
			struct nfs_args *nargs = dma->dma_nargs;

			do {
				struct nfs_args *next =
				    nargs->nfs_ext_u.nfs_extB.next;

				nfs4_trigger_nargs_destroy(nargs);
				nargs = next;
			} while (nargs != NULL);
		}

		kmem_free(dma, sizeof (domount_args_t));
	}
}

/*
 * The ephemeral_servinfo_t struct contains basic information we will need to
 * perform the mount. Whilst the structure is generic across different
 * types of ephemeral mount, the way we gather its contents differs.
 */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create(vnode_t *vp, servinfo4_t *svp, cred_t *cr)
{
	ephemeral_servinfo_t *esi;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(RP_ISSTUB(rp));

	/* Call the ephemeral type-specific routine */
	if (RP_ISSTUB_MIRRORMOUNT(rp))
		esi = nfs4_trigger_esi_create_mirrormount(vp, svp);
	else if (RP_ISSTUB_REFERRAL(rp))
		esi = nfs4_trigger_esi_create_referral(vp, cr);
	else
		esi = NULL;
	return (esi);
}

static void
nfs4_trigger_esi_destroy(ephemeral_servinfo_t *esi, vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);

	ASSERT(RP_ISSTUB(rp));

	/* Currently, no need for an ephemeral type-specific routine */

	/*
	 * The contents of ephemeral_servinfo_t goes into nfs_args,
	 * and will be handled by nfs4_trigger_nargs_destroy().
	 * We need only free the structure itself.
	 */
	if (esi != NULL)
		kmem_free(esi, sizeof (ephemeral_servinfo_t));
}

/*
 * Some of this may turn out to be common with other ephemeral types,
 * in which case it should be moved to nfs4_trigger_esi_create(), or a
 * common function called.
 */

/*
 * Mirror mounts case - should have all data available
 */
static ephemeral_servinfo_t *
nfs4_trigger_esi_create_mirrormount(vnode_t *vp, servinfo4_t *svp)
{
	char *stubpath;
	struct knetconfig *sikncp, *svkncp;
	struct netbuf *bufp;
	ephemeral_servinfo_t *esi;

	esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);

	/* initially set to be our type of ephemeral mount; may be added to */
	esi->esi_mount_flags = NFSMNT_MIRRORMOUNT;

	/*
	 * We're copying info from the stub rnode's servinfo4, but
	 * we must create new copies, not pointers, since this information
	 * is to be associated with the new mount, which will be
	 * unmounted (and its structures freed) separately
	 */

	/*
	 * Sizes passed to kmem_[z]alloc here must match those freed
	 * in nfs4_free_args()
	 */

	/*
	 * We hold sv_lock across kmem_zalloc() calls that may sleep, but
	 * this is difficult to avoid, as we need to read svp to calculate
	 * the sizes to be allocated.
	 */
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

	esi->esi_hostname = kmem_zalloc(strlen(svp->sv_hostname) + 1, KM_SLEEP);
	(void) strcat(esi->esi_hostname, svp->sv_hostname);

	esi->esi_addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
	bufp = esi->esi_addr;
	bufp->len = svp->sv_addr.len;
	bufp->maxlen = svp->sv_addr.maxlen;
	bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
	bcopy(svp->sv_addr.buf, bufp->buf, bufp->len);

	esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
	sikncp = esi->esi_knconf;
	svkncp = svp->sv_knconf;
	sikncp->knc_semantics = svkncp->knc_semantics;
	sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_protofmly,
	    (char *)svkncp->knc_protofmly);
	sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
	(void) strcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto);
	sikncp->knc_rdev = svkncp->knc_rdev;

	/*
	 * Used when AUTH_DH is negotiated.
	 *
	 * This is ephemeral mount-type specific, since it contains the
	 * server's time-sync syncaddr.
	 */
	if (svp->sv_dhsec) {
		struct netbuf *bufp;
		sec_data_t *sdata;
		dh_k4_clntdata_t *data;

		sdata = svp->sv_dhsec;
		data = (dh_k4_clntdata_t *)sdata->data;
		ASSERT(sdata->rpcflavor == AUTH_DH);

		bufp = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
		bufp->len = data->syncaddr.len;
		bufp->maxlen = data->syncaddr.maxlen;
		bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
		bcopy(data->syncaddr.buf, bufp->buf, bufp->len);
		esi->esi_syncaddr = bufp;

		if (data->netname != NULL) {
			int nmlen = data->netnamelen;

			/*
			 * We need to copy from a dh_k4_clntdata_t
			 * netname/netnamelen pair to a NUL-terminated
			 * netname string suitable for putting in nfs_args,
			 * where the latter has no netnamelen field.
			 */
			esi->esi_netname = kmem_zalloc(nmlen + 1, KM_SLEEP);
			bcopy(data->netname, esi->esi_netname, nmlen);
		}
	} else {
		esi->esi_syncaddr = NULL;
		esi->esi_netname = NULL;
	}

	stubpath = fn_path(VTOSV(vp)->sv_name);
	/* step over initial '.', to avoid e.g. sv_path: "/tank./ws" */
	ASSERT(*stubpath == '.');
	stubpath += 1;

	/* for nfs_args->fh */
	esi->esi_path_len = strlen(stubpath) + 1;
	if (strcmp(svp->sv_path, "/") != 0)
		esi->esi_path_len += strlen(svp->sv_path);
	esi->esi_path = kmem_zalloc(esi->esi_path_len, KM_SLEEP);
	if (strcmp(svp->sv_path, "/") != 0)
		(void) strcat(esi->esi_path, svp->sv_path);
	(void) strcat(esi->esi_path, stubpath);

	stubpath -= 1;
	/* stubpath allocated by fn_path() */
	kmem_free(stubpath, strlen(stubpath) + 1);

	nfs_rw_exit(&svp->sv_lock);

	return (esi);
}
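
/*
 * Illustration of the esi_path composition above: for a stub whose
 * fn_path() is "./ws" under a parent mount with sv_path "/tank", the
 * result is "/tank/ws"; if sv_path is just "/", the result is "/ws".
 */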

/*
 * Makes an upcall to the NFSMAPID daemon to resolve the hostname of the
 * NFS server and get the network information required to do the mount call.
 */
int
nfs4_callmapid(utf8string *server, struct nfs_fsl_info *resp)
{
	door_arg_t door_args;
	door_handle_t dh;
	XDR xdr;
	refd_door_args_t *xdr_argsp;
	refd_door_res_t *orig_resp;
	k_sigset_t smask;
	int xdr_len = 0;
	int res_len = 16; /* length of an IP address */
	int orig_reslen = res_len;
	int error = 0;
	struct nfsidmap_globals *nig;

	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
		return (ECONNREFUSED);

	nig = zone_getspecific(nfsidmap_zone_key, nfs_zone());
	ASSERT(nig != NULL);

	mutex_enter(&nig->nfsidmap_daemon_lock);
	dh = nig->nfsidmap_daemon_dh;
	if (dh == NULL) {
		mutex_exit(&nig->nfsidmap_daemon_lock);
		cmn_err(CE_NOTE,
		    "nfs4_callmapid: nfsmapid daemon not running; "
		    "unable to resolve host name\n");
		return (EINVAL);
	}
	door_ki_hold(dh);
	mutex_exit(&nig->nfsidmap_daemon_lock);

	xdr_len = xdr_sizeof(&(xdr_utf8string), server);

	xdr_argsp = kmem_zalloc(xdr_len + sizeof (*xdr_argsp), KM_SLEEP);
	xdr_argsp->xdr_len = xdr_len;
	xdr_argsp->cmd = NFSMAPID_SRV_NETINFO;

	xdrmem_create(&xdr, (char *)&xdr_argsp->xdr_arg,
	    xdr_len, XDR_ENCODE);

	if (!xdr_utf8string(&xdr, server)) {
		kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
		door_ki_rele(dh);
		return (1);
	}

	if (orig_reslen)
		orig_resp = kmem_alloc(orig_reslen, KM_SLEEP);

	door_args.data_ptr = (char *)xdr_argsp;
	door_args.data_size = sizeof (*xdr_argsp) + xdr_argsp->xdr_len;
	door_args.desc_ptr = NULL;
	door_args.desc_num = 0;
	door_args.rbuf = orig_resp ? (char *)orig_resp : NULL;
	door_args.rsize = res_len;

	sigintr(&smask, 1);
	error = door_ki_upcall(dh, &door_args);
	sigunintr(&smask);

	door_ki_rele(dh);

	kmem_free(xdr_argsp, xdr_len + sizeof (*xdr_argsp));
	if (error) {
		kmem_free(orig_resp, orig_reslen);
		/*
		 * There is no door to connect to. The referral daemon
		 * must not be running yet.
		 */
		cmn_err(CE_WARN,
		    "nfsmapid not running; cannot resolve host name");
		goto out;
	}

	/*
	 * If the results buffer passed back is not the same as the one
	 * that was sent, free the old buffer and use the new one.
	 */
	if (orig_resp && orig_reslen) {
		refd_door_res_t *door_resp;

		door_resp = (refd_door_res_t *)door_args.rbuf;
		if ((void *)door_args.rbuf != orig_resp)
			kmem_free(orig_resp, orig_reslen);
		if (door_resp->res_status == 0) {
			xdrmem_create(&xdr, (char *)&door_resp->xdr_res,
			    door_resp->xdr_len, XDR_DECODE);
			bzero(resp, sizeof (struct nfs_fsl_info));
			if (!xdr_nfs_fsl_info(&xdr, resp)) {
				DTRACE_PROBE2(
				    nfs4clnt__debug__referral__upcall__xdrfail,
				    struct nfs_fsl_info *, resp,
				    char *, "nfs4_callmapid");
				error = EINVAL;
			}
		} else {
			DTRACE_PROBE2(
			    nfs4clnt__debug__referral__upcall__badstatus,
			    int, door_resp->res_status,
			    char *, "nfs4_callmapid");
			error = door_resp->res_status;
		}
		kmem_free(door_args.rbuf, door_args.rsize);
	}
out:
	DTRACE_PROBE2(nfs4clnt__func__referral__upcall,
	    char *, server, int, error);
	return (error);
}

/*
 * Fetches the fs_locations attribute. Typically called
 * from a Replication/Migration/Referrals/Mirror-mount context.
 *
 * Fills in the attributes in garp. The caller is assumed
 * to have allocated memory for garp.
 *
 * lock: if set, take s_recovlock and mi_recovlock before doing the
 *	 rfs4call(); otherwise the caller must already hold these locks.
 *
 * Returns
 *	1 for success
 *	0 for failure
 */
int
nfs4_fetch_locations(mntinfo4_t *mi, nfs4_sharedfh_t *sfh, char *nm,
    cred_t *cr, nfs4_ga_res_t *garp, COMPOUND4res_clnt *callres, bool_t lock)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 *argop;
	int argoplist_size = 3 * sizeof (nfs_argop4);
	nfs4_server_t *sp = NULL;
	int doqueue = 1;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	int retval = 1;
	struct nfs4_clnt *nfscl;

	if (lock == TRUE)
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
	else
		ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
		    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	sp = find_nfs4_server(mi);
	if (lock == TRUE)
		nfs_rw_exit(&mi->mi_recovlock);

	if (sp != NULL)
		mutex_exit(&sp->s_lock);

	if (lock == TRUE) {
		if (sp != NULL)
			(void) nfs_rw_enter_sig(&sp->s_recovlock,
			    RW_WRITER, 0);
		(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_WRITER, 0);
	} else {
		if (sp != NULL) {
			ASSERT(nfs_rw_lock_held(&sp->s_recovlock, RW_READER) ||
			    nfs_rw_lock_held(&sp->s_recovlock, RW_WRITER));
		}
	}

	/*
	 * Do we want to do the setup for recovery here?
	 *
	 * We know that the server responded to a null ping a very
	 * short time ago, and we know that we intend to do a
	 * single stateless operation - we want to fetch attributes,
	 * so we know we can't encounter errors about state. If
	 * something goes wrong with the GETATTR, like not being
	 * able to get a response from the server or getting any
	 * kind of FH error, we should fail the mount.
	 *
	 * We may want to revisit this at a later time.
	 */
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	args.ctag = TAG_GETATTR_FSLOCATION;
	/* PUTFH LOOKUP GETATTR */
	args.array_len = 3;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = sfh;

	/* 1. lookup name, can't be dotdot */
	argop[1].argop = OP_CLOOKUP;
	argop[1].nfs_argop4_u.opclookup.cname = nm;

	/* 2. file attrs */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request =
	    FATTR4_FSID_MASK | FATTR4_FS_LOCATIONS_MASK |
	    FATTR4_MOUNTED_ON_FILEID_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (lock == TRUE) {
		nfs_rw_exit(&mi->mi_recovlock);
		if (sp != NULL)
			nfs_rw_exit(&sp->s_recovlock);
	}

	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
	nfscl->nfscl_stat.referrals.value.ui64++;
	DTRACE_PROBE3(nfs4clnt__func__referral__fsloc,
	    nfs4_sharedfh_t *, sfh, char *, nm, nfs4_error_t *, &e);

	if (e.error != 0) {
		if (sp != NULL)
			nfs4_server_rele(sp);
		kmem_free(argop, argoplist_size);
		return (0);
	}

	/*
	 * Check for all possible error conditions.
	 * For valid replies without an ops array or for illegal
	 * replies, return a failure.
	 */
	if (res.status != NFS4_OK || res.array_len < 3 ||
	    res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		retval = 0;
		goto exit;
	}

	/*
	 * There isn't much value in putting the attributes
	 * in the attr cache since fs_locations4 aren't
	 * encountered very frequently, so just make them
	 * available to the caller.
	 */
	*garp = res.array[2].nfs_resop4_u.opgetattr.ga_res;

	DTRACE_PROBE2(nfs4clnt__debug__referral__fsloc,
	    nfs4_ga_res_t *, garp, char *, "nfs4_fetch_locations");

	/* No fs_locations? -- return a failure */
	if (garp->n4g_ext_res == NULL ||
	    garp->n4g_ext_res->n4g_fslocations.locations_val == NULL) {
		retval = 0;
		goto exit;
	}

	if (!garp->n4g_fsid_valid)
		retval = 0;

exit:
	if (retval == 0) {
		/* the call was ok but failed validating the call results */
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	} else {
		ASSERT(callres != NULL);
		*callres = res;
	}

	if (sp != NULL)
		nfs4_server_rele(sp);
	kmem_free(argop, argoplist_size);
	return (retval);
}

/* tunable to disable referral mounts */
int nfs4_no_referrals = 0;

/*
 * Returns NULL if the vnode cannot be created or found.
 */
vnode_t *
find_referral_stubvp(vnode_t *dvp, char *nm, cred_t *cr)
{
	nfs_fh4 *stub_fh, *dfh;
	nfs4_sharedfh_t *sfhp;
	char *newfhval;
	vnode_t *vp = NULL;
	fattr4_mounted_on_fileid mnt_on_fileid;
	nfs4_ga_res_t garp;
	mntinfo4_t *mi;
	COMPOUND4res_clnt callres;
	hrtime_t t;

	if (nfs4_no_referrals)
		return (NULL);

	/*
	 * Get the mounted_on_fileid, unique on that server::fsid
	 */
	mi = VTOMI4(dvp);
	if (nfs4_fetch_locations(mi, VTOR4(dvp)->r_fh, nm, cr,
	    &garp, &callres, FALSE) == 0)
		return (NULL);
	mnt_on_fileid = garp.n4g_mon_fid;
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);

	/*
	 * Build a fake filehandle from the dir FH and the mounted_on_fileid
	 */
	dfh = &VTOR4(dvp)->r_fh->sfh_fh;
	stub_fh = kmem_alloc(sizeof (nfs_fh4), KM_SLEEP);
	stub_fh->nfs_fh4_val = kmem_alloc(dfh->nfs_fh4_len +
	    sizeof (fattr4_mounted_on_fileid), KM_SLEEP);
	newfhval = stub_fh->nfs_fh4_val;

	/* copy directory's file handle */
	bcopy(dfh->nfs_fh4_val, newfhval, dfh->nfs_fh4_len);
	stub_fh->nfs_fh4_len = dfh->nfs_fh4_len;
	newfhval = newfhval + dfh->nfs_fh4_len;

	/* Add mounted_on_fileid. Use bcopy to avoid alignment problem */
	bcopy((char *)&mnt_on_fileid, newfhval,
	    sizeof (fattr4_mounted_on_fileid));
	stub_fh->nfs_fh4_len += sizeof (fattr4_mounted_on_fileid);

	sfhp = sfh4_put(stub_fh, VTOMI4(dvp), NULL);
	kmem_free(stub_fh->nfs_fh4_val, dfh->nfs_fh4_len +
	    sizeof (fattr4_mounted_on_fileid));
	kmem_free(stub_fh, sizeof (nfs_fh4));
	if (sfhp == NULL)
		return (NULL);

	t = gethrtime();
	garp.n4g_va.va_type = VDIR;
	vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t,
	    cr, dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));

	if (vp != NULL)
		vp->v_type = VDIR;

	sfh4_rele(&sfhp);
	return (vp);
}
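
/*
 * The fake filehandle built above is, schematically:
 *
 *	[ parent directory FH bytes | mounted_on_fileid bytes ]
 *
 * Its purpose is simply to give the referral stub a distinct identity
 * in the client's shared-filehandle/rnode cache (via sfh4_put()) under
 * the parent's server::fsid.
 */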

int
nfs4_setup_referral(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	vnode_t *nvp;
	rnode4_t *rp;

	if ((nvp = find_referral_stubvp(dvp, nm, cr)) == NULL)
		return (EINVAL);

	rp = VTOR4(nvp);
	mutex_enter(&rp->r_statelock);
	r4_stub_referral(rp);
	mutex_exit(&rp->r_statelock);
	dnlc_enter(dvp, nm, nvp);

	if (*vpp != NULL)
		VN_RELE(*vpp);	/* no longer need this vnode */

	*vpp = nvp;

	return (0);
}

/*
 * Fetch the location information and resolve the new server.
 * Caller needs to free up the XDR data which is returned.
 * Input: mount info, shared filehandle, nodename
 * Return: index of the result, or -1 on error
 * Output: FsLocations info, resolved server info.
 */
int
nfs4_process_referral(mntinfo4_t *mi, nfs4_sharedfh_t *sfh,
    char *nm, cred_t *cr, nfs4_ga_res_t *grp, COMPOUND4res_clnt *res,
    struct nfs_fsl_info *fsloc)
{
	fs_location4 *fsp;
	struct nfs_fsl_info nfsfsloc;
	int ret, i, error;
	nfs4_ga_res_t garp;
	COMPOUND4res_clnt callres;
	struct knetconfig *knc;

	ret = nfs4_fetch_locations(mi, sfh, nm, cr, &garp, &callres, TRUE);
	if (ret == 0)
		return (-1);

	/*
	 * As a lame attempt at figuring out whether we're
	 * handling a migration event or a referral,
	 * look for rnodes with this fsid in the rnode
	 * cache.
	 *
	 * If we can find one or more such rnodes, it
	 * means we're handling a migration event and
	 * we want to bail out in that case.
	 */
	if (r4find_by_fsid(mi, &garp.n4g_fsid)) {
		DTRACE_PROBE3(nfs4clnt__debug__referral__migration,
		    mntinfo4_t *, mi, nfs4_ga_res_t *, &garp,
		    char *, "nfs4_process_referral");
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
		return (-1);
	}

	/*
	 * Find the first responsive server to mount. When we find
	 * one, fsp will point to it.
	 */
	for (i = 0; i < garp.n4g_ext_res->n4g_fslocations.locations_len; i++) {

		fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[i];
		if (fsp->server_len == 0 || fsp->server_val == NULL)
			continue;

		error = nfs4_callmapid(fsp->server_val, &nfsfsloc);
		if (error != 0)
			continue;

		error = nfs4_ping_server_common(nfsfsloc.knconf,
		    nfsfsloc.addr, !(mi->mi_flags & MI4_INT));
		if (error == RPC_SUCCESS)
			break;

		DTRACE_PROBE2(nfs4clnt__debug__referral__srvaddr,
		    sockaddr_in *, (struct sockaddr_in *)nfsfsloc.addr->buf,
		    char *, "nfs4_process_referral");

		(void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
	}
	knc = nfsfsloc.knconf;
	if ((i >= garp.n4g_ext_res->n4g_fslocations.locations_len) ||
	    (knc->knc_protofmly == NULL) || (knc->knc_proto == NULL)) {
		DTRACE_PROBE2(nfs4clnt__debug__referral__nofsloc,
		    nfs4_ga_res_t *, &garp, char *, "nfs4_process_referral");
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
		return (-1);
	}

	/* Send the results back */
	*fsloc = nfsfsloc;
	*grp = garp;
	*res = callres;
	return (i);
}
1683
1684 /*
1685 * Referrals case - need to fetch referral data and then upcall to
1686 * user-level to get complete mount data.
1687 */
1688 static ephemeral_servinfo_t *
nfs4_trigger_esi_create_referral(vnode_t * vp,cred_t * cr)1689 nfs4_trigger_esi_create_referral(vnode_t *vp, cred_t *cr)
1690 {
1691 struct knetconfig *sikncp, *svkncp;
1692 struct netbuf *bufp;
1693 ephemeral_servinfo_t *esi;
1694 vnode_t *dvp;
1695 rnode4_t *drp;
1696 fs_location4 *fsp;
1697 struct nfs_fsl_info nfsfsloc;
1698 nfs4_ga_res_t garp;
1699 char *p;
1700 char fn[MAXNAMELEN];
1701 int i, index = -1;
1702 mntinfo4_t *mi;
1703 COMPOUND4res_clnt callres;
1704
1705 /*
1706 * If we're passed in a stub vnode that
1707 * isn't a "referral" stub, bail out
1708 * and return a failure
1709 */
1710 if (!RP_ISSTUB_REFERRAL(VTOR4(vp)))
1711 return (NULL);
1712
1713 if (vtodv(vp, &dvp, CRED(), TRUE) != 0)
1714 return (NULL);
1715
1716 drp = VTOR4(dvp);
1717 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
1718 VN_RELE(dvp);
1719 return (NULL);
1720 }
1721
1722 if (vtoname(vp, fn, MAXNAMELEN) != 0) {
1723 nfs_rw_exit(&drp->r_rwlock);
1724 VN_RELE(dvp);
1725 return (NULL);
1726 }
1727
1728 mi = VTOMI4(dvp);
1729 index = nfs4_process_referral(mi, drp->r_fh, fn, cr,
1730 &garp, &callres, &nfsfsloc);
1731 nfs_rw_exit(&drp->r_rwlock);
1732 VN_RELE(dvp);
1733 if (index < 0)
1734 return (NULL);
1735
1736 fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1737 esi = kmem_zalloc(sizeof (ephemeral_servinfo_t), KM_SLEEP);
1738
1739 /* initially set to be our type of ephemeral mount; may be added to */
1740 esi->esi_mount_flags = NFSMNT_REFERRAL;
1741
1742 esi->esi_hostname =
1743 kmem_zalloc(fsp->server_val->utf8string_len + 1, KM_SLEEP);
1744 bcopy(fsp->server_val->utf8string_val, esi->esi_hostname,
1745 fsp->server_val->utf8string_len);
1746 esi->esi_hostname[fsp->server_val->utf8string_len] = '\0';
1747
1748 bufp = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
1749 bufp->len = nfsfsloc.addr->len;
1750 bufp->maxlen = nfsfsloc.addr->maxlen;
1751 bufp->buf = kmem_zalloc(bufp->len, KM_SLEEP);
1752 bcopy(nfsfsloc.addr->buf, bufp->buf, bufp->len);
1753 esi->esi_addr = bufp;
1754
1755 esi->esi_knconf = kmem_zalloc(sizeof (*esi->esi_knconf), KM_SLEEP);
1756 sikncp = esi->esi_knconf;
1757
1758 DTRACE_PROBE2(nfs4clnt__debug__referral__nfsfsloc,
1759 struct nfs_fsl_info *, &nfsfsloc,
1760 char *, "nfs4_trigger_esi_create_referral");
1761
1762 svkncp = nfsfsloc.knconf;
1763 sikncp->knc_semantics = svkncp->knc_semantics;
1764 sikncp->knc_protofmly = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1765 (void) strlcat((char *)sikncp->knc_protofmly,
1766 (char *)svkncp->knc_protofmly, KNC_STRSIZE);
1767 sikncp->knc_proto = (caddr_t)kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1768 (void) strlcat((char *)sikncp->knc_proto, (char *)svkncp->knc_proto,
1769 KNC_STRSIZE);
1770 sikncp->knc_rdev = svkncp->knc_rdev;
1771
1772 DTRACE_PROBE2(nfs4clnt__debug__referral__knetconf,
1773 struct knetconfig *, sikncp,
1774 char *, "nfs4_trigger_esi_create_referral");
1775
1776 esi->esi_netname = kmem_zalloc(nfsfsloc.netnm_len, KM_SLEEP);
1777 bcopy(nfsfsloc.netname, esi->esi_netname, nfsfsloc.netnm_len);
1778 esi->esi_syncaddr = NULL;
1779
1780 esi->esi_path = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1781 esi->esi_path_len = MAXPATHLEN;
1782 *p++ = '/';
1783 for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1784 component4 *comp;
1785
1786 comp = &fsp->rootpath.pathname4_val[i];
1787 		/* If no space left, bail out and clean up */
1788 if ((p - esi->esi_path) + comp->utf8string_len + 1 > MAXPATHLEN)
1789 goto err;
1790 bcopy(comp->utf8string_val, p, comp->utf8string_len);
1791 p += comp->utf8string_len;
1792 *p++ = '/';
1793 }
1794 if (fsp->rootpath.pathname4_len != 0)
1795 *(p - 1) = '\0';
1796 else
1797 *p = '\0';
1798 p = esi->esi_path;
1799 esi->esi_path = strdup(p);
1800 esi->esi_path_len = strlen(p) + 1;
1801 kmem_free(p, MAXPATHLEN);
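	/*
	 * As a worked example (hypothetical values): rootpath
	 * components of { "export", "home" } assemble into an
	 * esi_path of "/export/home", while an empty rootpath
	 * yields "/". The loop appends a trailing '/' for each
	 * component; the non-empty case above then overwrites
	 * the final '/' with the terminating NUL.
	 */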
1802
1803 /* Allocated in nfs4_process_referral() */
1804 (void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1805 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1806
1807 return (esi);
1808 err:
1809 kmem_free(esi->esi_path, esi->esi_path_len);
1810 kmem_free(esi->esi_hostname, fsp->server_val->utf8string_len + 1);
1811 kmem_free(esi->esi_addr->buf, esi->esi_addr->len);
1812 kmem_free(esi->esi_addr, sizeof (struct netbuf));
1813 kmem_free(esi->esi_knconf->knc_protofmly, KNC_STRSIZE);
1814 kmem_free(esi->esi_knconf->knc_proto, KNC_STRSIZE);
1815 kmem_free(esi->esi_knconf, sizeof (*esi->esi_knconf));
1816 kmem_free(esi->esi_netname, nfsfsloc.netnm_len);
1817 kmem_free(esi, sizeof (ephemeral_servinfo_t));
1818 (void) xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1819 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1820 return (NULL);
1821 }
1822
1823 /*
1824 * Assemble the args, and call the generic VFS mount function to
1825 * finally perform the ephemeral mount.
1826 */
1827 static int
1828 nfs4_trigger_domount(vnode_t *stubvp, domount_args_t *dma, vfs_t **vfsp,
1829 cred_t *cr, vnode_t **newvpp)
1830 {
1831 struct mounta *uap;
1832 char *mntpt, *orig_path, *path;
1833 const char *orig_mntpt;
1834 int retval;
1835 int mntpt_len;
1836 int spec_len;
1837 zone_t *zone = curproc->p_zone;
1838 bool_t has_leading_slash;
1839 int i;
1840
1841 vfs_t *stubvfsp = stubvp->v_vfsp;
1842 ephemeral_servinfo_t *esi = dma->dma_esi;
1843 struct nfs_args *nargs = dma->dma_nargs;
1844
1845 /* first, construct the mount point for the ephemeral mount */
1846 orig_path = path = fn_path(VTOSV(stubvp)->sv_name);
1847 orig_mntpt = (char *)refstr_value(stubvfsp->vfs_mntpt);
1848
1849 if (*orig_path == '.')
1850 orig_path++;
1851
1852 /*
1853 * Get rid of zone's root path
1854 */
1855 if (zone != global_zone) {
1856 /*
1857 * -1 for trailing '/' and -1 for EOS.
1858 */
1859 if (strncmp(zone->zone_rootpath, orig_mntpt,
1860 zone->zone_rootpathlen - 1) == 0) {
1861 orig_mntpt += (zone->zone_rootpathlen - 2);
1862 }
1863 }
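	/*
	 * E.g. (hypothetical paths): in a zone whose zone_rootpath
	 * is "/zones/z1/root/", a parent mount point of
	 * "/zones/z1/root/net/srv" is trimmed to "/net/srv",
	 * which is the path as seen from inside the zone.
	 */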
1864
1865 mntpt_len = strlen(orig_mntpt) + strlen(orig_path);
1866 mntpt = kmem_zalloc(mntpt_len + 1, KM_SLEEP);
1867 (void) strcat(mntpt, orig_mntpt);
1868 (void) strcat(mntpt, orig_path);
1869
1870 kmem_free(path, strlen(path) + 1);
1871 path = esi->esi_path;
1872 if (*path == '.')
1873 path++;
1874 if (path[0] == '/' && path[1] == '/')
1875 path++;
1876 has_leading_slash = (*path == '/');
1877
1878 spec_len = strlen(dma->dma_hostlist);
1879 spec_len += strlen(path);
1880
1881 /* We are going to have to add this in */
1882 if (!has_leading_slash)
1883 spec_len++;
1884
1885 /* We need to get the ':' for dma_hostlist:esi_path */
1886 spec_len++;
1887
1888 uap = kmem_zalloc(sizeof (struct mounta), KM_SLEEP);
1889 uap->spec = kmem_zalloc(spec_len + 1, KM_SLEEP);
1890 (void) snprintf(uap->spec, spec_len + 1, "%s:%s%s", dma->dma_hostlist,
1891 has_leading_slash ? "" : "/", path);
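	/*
	 * E.g. (hypothetical values): a dma_hostlist of
	 * "serverA,serverB" and an esi_path of "export/data"
	 * (no leading slash) produce the special device string
	 * "serverA,serverB:/export/data", just as would be
	 * passed to a user-level NFS mount.
	 */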
1892
1893 uap->dir = mntpt;
1894
1895 uap->flags = MS_SYSSPACE | MS_DATA;
1896 /* fstype-independent mount options not covered elsewhere */
1897 /* copy parent's mount(1M) "-m" flag */
1898 if (stubvfsp->vfs_flag & VFS_NOMNTTAB)
1899 uap->flags |= MS_NOMNTTAB;
1900
1901 uap->fstype = MNTTYPE_NFS4;
1902 uap->dataptr = (char *)nargs;
1903 /* not needed for MS_SYSSPACE */
1904 uap->datalen = 0;
1905
1906 /* use optptr to pass in extra mount options */
1907 uap->flags |= MS_OPTIONSTR;
1908 uap->optptr = nfs4_trigger_create_mntopts(stubvfsp);
1909 if (uap->optptr == NULL) {
1910 retval = EINVAL;
1911 goto done;
1912 }
1913
1914 /* domount() expects us to count the trailing NUL */
1915 uap->optlen = strlen(uap->optptr) + 1;
1916
1917 /*
1918 * If we get EBUSY, we try again once to see if we can perform
1919 * the mount; we may simply be racing with another mount.
1920 */
1921 for (i = 0; i < 2; i++) {
1922 int error;
1923 bool_t was_mounted;
1924
1925 retval = domount(NULL, uap, stubvp, cr, vfsp);
1926 if (retval == 0) {
1927 retval = VFS_ROOT(*vfsp, newvpp);
1928 VFS_RELE(*vfsp);
1929 break;
1930 } else if (retval != EBUSY) {
1931 break;
1932 }
1933
1934 /*
1935 * We might find it mounted by the other racer...
1936 */
1937 error = nfs4_trigger_mounted_already(stubvp,
1938 newvpp, &was_mounted, vfsp);
1939 if (error) {
1940 goto done;
1941 } else if (was_mounted) {
1942 retval = 0;
1943 break;
1944 }
1945 }
1946
1947 done:
1948 if (uap->optptr)
1949 nfs4_trigger_destroy_mntopts(uap->optptr);
1950
1951 kmem_free(uap->spec, spec_len + 1);
1952 kmem_free(uap, sizeof (struct mounta));
1953 kmem_free(mntpt, mntpt_len + 1);
1954
1955 return (retval);
1956 }
1957
1958 /*
1959 * Build an nfs_args structure for passing to domount().
1960 *
1961 * Ephemeral mount-type specific data comes from the ephemeral_servinfo_t;
1962 * generic data - common to all ephemeral mount types - is read directly
1963 * from the parent mount's servinfo4_t and mntinfo4_t, via the stub vnode.
1964 */
1965 static struct nfs_args *
1966 nfs4_trigger_nargs_create(mntinfo4_t *mi, servinfo4_t *svp,
1967 ephemeral_servinfo_t *esi)
1968 {
1969 sec_data_t *secdata;
1970 struct nfs_args *nargs;
1971
1972 /* setup the nfs args */
1973 nargs = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
1974
1975 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1976
1977 nargs->addr = esi->esi_addr;
1978
1979 /* for AUTH_DH by negotiation */
1980 if (esi->esi_syncaddr || esi->esi_netname) {
1981 nargs->flags |= NFSMNT_SECURE;
1982 nargs->syncaddr = esi->esi_syncaddr;
1983 nargs->netname = esi->esi_netname;
1984 }
1985
1986 nargs->flags |= NFSMNT_KNCONF;
1987 nargs->knconf = esi->esi_knconf;
1988 nargs->flags |= NFSMNT_HOSTNAME;
1989 nargs->hostname = esi->esi_hostname;
1990 nargs->fh = esi->esi_path;
1991
1992 /* general mount settings, all copied from parent mount */
1993 mutex_enter(&mi->mi_lock);
1994
1995 if (!(mi->mi_flags & MI4_HARD))
1996 nargs->flags |= NFSMNT_SOFT;
1997
1998 nargs->flags |= NFSMNT_WSIZE | NFSMNT_RSIZE | NFSMNT_TIMEO |
1999 NFSMNT_RETRANS;
2000 nargs->wsize = mi->mi_stsize;
2001 nargs->rsize = mi->mi_tsize;
2002 nargs->timeo = mi->mi_timeo;
2003 nargs->retrans = mi->mi_retrans;
2004
2005 if (mi->mi_flags & MI4_INT)
2006 nargs->flags |= NFSMNT_INT;
2007 if (mi->mi_flags & MI4_NOAC)
2008 nargs->flags |= NFSMNT_NOAC;
2009
2010 nargs->flags |= NFSMNT_ACREGMIN | NFSMNT_ACREGMAX | NFSMNT_ACDIRMIN |
2011 NFSMNT_ACDIRMAX;
2012 nargs->acregmin = HR2SEC(mi->mi_acregmin);
2013 nargs->acregmax = HR2SEC(mi->mi_acregmax);
2014 nargs->acdirmin = HR2SEC(mi->mi_acdirmin);
2015 nargs->acdirmax = HR2SEC(mi->mi_acdirmax);
2016
2017 /* add any specific flags for this type of ephemeral mount */
2018 nargs->flags |= esi->esi_mount_flags;
2019
2020 if (mi->mi_flags & MI4_NOCTO)
2021 nargs->flags |= NFSMNT_NOCTO;
2022 if (mi->mi_flags & MI4_GRPID)
2023 nargs->flags |= NFSMNT_GRPID;
2024 if (mi->mi_flags & MI4_LLOCK)
2025 nargs->flags |= NFSMNT_LLOCK;
2026 if (mi->mi_flags & MI4_NOPRINT)
2027 nargs->flags |= NFSMNT_NOPRINT;
2028 if (mi->mi_flags & MI4_DIRECTIO)
2029 nargs->flags |= NFSMNT_DIRECTIO;
2030 if (mi->mi_flags & MI4_PUBLIC && nargs->flags & NFSMNT_MIRRORMOUNT)
2031 nargs->flags |= NFSMNT_PUBLIC;
2032
2033 /* Do some referral-specific option tweaking */
2034 if (nargs->flags & NFSMNT_REFERRAL) {
2035 nargs->flags &= ~NFSMNT_DORDMA;
2036 nargs->flags |= NFSMNT_TRYRDMA;
2037 }
2038
2039 mutex_exit(&mi->mi_lock);
2040
2041 /*
2042 * Security data & negotiation policy.
2043 *
2044 * For mirror mounts, we need to preserve the parent mount's
2045 * preference for security negotiation, translating SV4_TRYSECDEFAULT
2046 * to NFSMNT_SECDEFAULT if present.
2047 *
2048 * For referrals, we always want security negotiation and will
2049 * set NFSMNT_SECDEFAULT and we will not copy current secdata.
2050 * The reason is that we can't negotiate down from a parent's
2051 * Kerberos flavor to AUTH_SYS.
2052 *
2053 * If SV4_TRYSECDEFAULT is not set, that indicates that a specific
2054 * security flavour was requested, with data in sv_secdata, and that
2055 * no negotiation should occur. If this specified flavour fails, that's
2056 * it. We will copy sv_secdata, and not set NFSMNT_SECDEFAULT.
2057 *
2058 * If SV4_TRYSECDEFAULT is set, then we start with a passed-in
2059 * default flavour, in sv_secdata, but then negotiate a new flavour.
2060 * Possible flavours are recorded in an array in sv_secinfo, with
2061 * currently in-use flavour pointed to by sv_currsec.
2062 *
2063 * If sv_currsec is set, i.e. if negotiation has already occurred,
2064 * we will copy sv_currsec. Otherwise, copy sv_secdata. Regardless,
2065 * we will set NFSMNT_SECDEFAULT, to enable negotiation.
2066 */
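	/*
	 * In summary, the three cases handled below are:
	 *
	 *	referral:		fresh AUTH_SYS secdata;
	 *				NFSMNT_SECDEFAULT set
	 *	SV4_TRYSECDEFAULT set:	copy sv_currsec if negotiated,
	 *				else sv_secdata;
	 *				NFSMNT_SECDEFAULT set
	 *	otherwise:		copy sv_secdata;
	 *				NFSMNT_SECDEFAULT not set
	 */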
2067 if (nargs->flags & NFSMNT_REFERRAL) {
2068 /* enable negotiation for referral mount */
2069 nargs->flags |= NFSMNT_SECDEFAULT;
2070 secdata = kmem_alloc(sizeof (sec_data_t), KM_SLEEP);
2071 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
2072 secdata->data = NULL;
2073 } else if (svp->sv_flags & SV4_TRYSECDEFAULT) {
2074 /* enable negotiation for mirror mount */
2075 nargs->flags |= NFSMNT_SECDEFAULT;
2076
2077 /*
2078 * As a starting point for negotiation, copy parent
2079 * mount's negotiated flavour (sv_currsec) if available,
2080 * or its passed-in flavour (sv_secdata) if not.
2081 */
2082 if (svp->sv_currsec != NULL)
2083 secdata = copy_sec_data(svp->sv_currsec);
2084 else if (svp->sv_secdata != NULL)
2085 secdata = copy_sec_data(svp->sv_secdata);
2086 else
2087 secdata = NULL;
2088 } else {
2089 /* do not enable negotiation; copy parent's passed-in flavour */
2090 if (svp->sv_secdata != NULL)
2091 secdata = copy_sec_data(svp->sv_secdata);
2092 else
2093 secdata = NULL;
2094 }
2095
2096 nfs_rw_exit(&svp->sv_lock);
2097
2098 nargs->flags |= NFSMNT_NEWARGS;
2099 nargs->nfs_args_ext = NFS_ARGS_EXTB;
2100 nargs->nfs_ext_u.nfs_extB.secdata = secdata;
2101
2102 /* for NFS RO failover; caller will set if necessary */
2103 nargs->nfs_ext_u.nfs_extB.next = NULL;
2104
2105 return (nargs);
2106 }
2107
2108 static void
2109 nfs4_trigger_nargs_destroy(struct nfs_args *nargs)
2110 {
2111 /*
2112 * Either the mount failed, in which case the data is not needed, or
2113 * nfs4_mount() has taken copies of what it needs; where it has
2114 * merely copied a pointer, it has set *our* pointer to NULL, so
2115 * that nfs4_free_args() will ignore it.
2116 */
2117 nfs4_free_args(nargs);
2118 kmem_free(nargs, sizeof (struct nfs_args));
2119 }
2120
2121 /*
2122 * When we finally get into the mounting, we need to add this
2123 * node to the ephemeral tree.
2124 *
2125 * This is called from nfs4_mount().
2126 */
2127 int
2128 nfs4_record_ephemeral_mount(mntinfo4_t *mi, vnode_t *mvp)
2129 {
2130 mntinfo4_t *mi_parent;
2131 nfs4_ephemeral_t *eph;
2132 nfs4_ephemeral_tree_t *net;
2133
2134 nfs4_ephemeral_t *prior;
2135 nfs4_ephemeral_t *child;
2136
2137 nfs4_ephemeral_t *peer;
2138
2139 nfs4_trigger_globals_t *ntg;
2140 zone_t *zone = curproc->p_zone;
2141
2142 int rc = 0;
2143
2144 mi_parent = VTOMI4(mvp);
2145
2146 /*
2147 * Get this before grabbing anything else!
2148 */
2149 ntg = zone_getspecific(nfs4_ephemeral_key, zone);
2150 if (!ntg->ntg_thread_started) {
2151 nfs4_ephemeral_start_harvester(ntg);
2152 }
2153
2154 mutex_enter(&mi_parent->mi_lock);
2155 mutex_enter(&mi->mi_lock);
2156
2157 net = mi->mi_ephemeral_tree =
2158 mi_parent->mi_ephemeral_tree;
2159
2160 /*
2161 * If the mi_ephemeral_tree is NULL, then it
2162 * means that either the harvester or a manual
2163 * umount has cleared the tree out right before
2164 * we got here.
2165 *
2166 * There is nothing we can do here, so return
2167 * to the caller and let them decide whether to
2168 * try again.
2169 */
2170 if (net == NULL) {
2171 mutex_exit(&mi->mi_lock);
2172 mutex_exit(&mi_parent->mi_lock);
2173
2174 return (EBUSY);
2175 }
2176
2177 /*
2178 * We've just tied the mntinfo to the tree, so
2179 * now we bump the refcnt and hold it there until
2180 * this mntinfo is removed from the tree.
2181 */
2182 nfs4_ephemeral_tree_hold(net);
2183
2184 /*
2185 * We need to tack together the ephemeral mount
2186 * with this new mntinfo.
2187 */
2188 eph = kmem_zalloc(sizeof (*eph), KM_SLEEP);
2189 eph->ne_mount = mi;
2190 MI4_HOLD(mi);
2191 VFS_HOLD(mi->mi_vfsp);
2192 eph->ne_ref_time = gethrestime_sec();
2193
2194 /*
2195 * We need to tell the ephemeral mount when
2196 * to time out.
2197 */
2198 eph->ne_mount_to = ntg->ntg_mount_to;
2199
2200 mi->mi_ephemeral = eph;
2201
2202 /*
2203 * If the enclosing mntinfo4 is also ephemeral,
2204 * then we need to point to its enclosing parent.
2205 * Else the enclosing mntinfo4 is the enclosing parent.
2206 *
2207 * We also need to weave this ephemeral node
2208 * into the tree.
2209 */
2210 if (mi_parent->mi_flags & MI4_EPHEMERAL) {
2211 /*
2212 * We need to decide if we are
2213 * the root node of this branch
2214 * or if we are a sibling of this
2215 * branch.
2216 */
2217 prior = mi_parent->mi_ephemeral;
2218 if (prior == NULL) {
2219 /*
2220 * Race condition, clean up, and
2221 * let caller handle mntinfo.
2222 */
2223 mi->mi_flags &= ~MI4_EPHEMERAL;
2224 mi->mi_ephemeral = NULL;
2225 kmem_free(eph, sizeof (*eph));
2226 VFS_RELE(mi->mi_vfsp);
2227 MI4_RELE(mi);
2228 nfs4_ephemeral_tree_rele(net);
2229 rc = EBUSY;
2230 } else {
2231 if (prior->ne_child == NULL) {
2232 prior->ne_child = eph;
2233 } else {
2234 child = prior->ne_child;
2235
2236 prior->ne_child = eph;
2237 eph->ne_peer = child;
2238
2239 child->ne_prior = eph;
2240 }
2241
2242 eph->ne_prior = prior;
2243 }
2244 } else {
2245 /*
2246 * The parent mntinfo4 is the non-ephemeral
2247 * root of the ephemeral tree. We
2248 * need to decide if we are the root
2249 * node of that tree or if we are a
2250 * sibling of the root node.
2251 *
2252 * We are the root if there is no
2253 * other node.
2254 */
2255 if (net->net_root == NULL) {
2256 net->net_root = eph;
2257 } else {
2258 eph->ne_peer = peer = net->net_root;
2259 ASSERT(peer != NULL);
2260 net->net_root = eph;
2261
2262 peer->ne_prior = eph;
2263 }
2264
2265 eph->ne_prior = NULL;
2266 }
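	/*
	 * To illustrate the weaving above with a hypothetical
	 * tree: inserting new node E under a parent P which
	 * already has a child C changes
	 *
	 *	P		P
	 *	|	to	|
	 *	C		E -peer-> C
	 *
	 * i.e. the new node always becomes the head of the
	 * child (or root peer) list, with the ne_prior back
	 * pointers updated to match.
	 */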
2267
2268 mutex_exit(&mi->mi_lock);
2269 mutex_exit(&mi_parent->mi_lock);
2270
2271 return (rc);
2272 }
2273
2274 /*
2275 * Commit the changes to the ephemeral tree for removing this node.
2276 */
2277 static void
2278 nfs4_ephemeral_umount_cleanup(nfs4_ephemeral_t *eph)
2279 {
2280 nfs4_ephemeral_t *e = eph;
2281 nfs4_ephemeral_t *peer;
2282 nfs4_ephemeral_t *prior;
2283
2284 peer = eph->ne_peer;
2285 prior = e->ne_prior;
2286
2287 /*
2288 * If this branch root was not the
2289 * tree root, then we need to fix back pointers.
2290 */
2291 if (prior) {
2292 if (prior->ne_child == e) {
2293 prior->ne_child = peer;
2294 } else {
2295 prior->ne_peer = peer;
2296 }
2297
2298 if (peer)
2299 peer->ne_prior = prior;
2300 } else if (peer) {
2301 peer->ne_mount->mi_ephemeral_tree->net_root = peer;
2302 peer->ne_prior = NULL;
2303 } else {
2304 e->ne_mount->mi_ephemeral_tree->net_root = NULL;
2305 }
2306 }
2307
2308 /*
2309 * We want to avoid recursion at all costs. So we need to
2310 * unroll the tree. We do this by a depth first traversal to
2311 * leaf nodes. We blast away the leaf and work our way back
2312 * up and down the tree.
2313 */
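/*
 * For example, given a hypothetical tree with root R, R's child A,
 * A's peer B and A's child C, the engine unmounts in the order
 * C, B, A and finally R itself, the last only if isTreeRoot is TRUE.
 */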
2314 static int
2315 nfs4_ephemeral_unmount_engine(nfs4_ephemeral_t *eph,
2316 int isTreeRoot, int flag, cred_t *cr)
2317 {
2318 nfs4_ephemeral_t *e = eph;
2319 nfs4_ephemeral_t *prior;
2320 mntinfo4_t *mi;
2321 vfs_t *vfsp;
2322 int error;
2323
2324 /*
2325 * We use the loop while unrolling the ephemeral tree.
2326 */
2327 for (;;) {
2328 /*
2329 * First we walk down the child.
2330 */
2331 if (e->ne_child) {
2332 prior = e;
2333 e = e->ne_child;
2334 continue;
2335 }
2336
2337 /*
2338 * If we are the root of the branch we are removing,
2339 * we end it here. But if the branch is the root of
2340 * the tree, we have to forge on. We do not consider
2341 * the peer list for the root because while it may
2342 * be okay to remove, it is both extra work and a
2343 * potential for a false-positive error to stall the
2344 * unmount attempt.
2345 */
2346 if (e == eph && isTreeRoot == FALSE)
2347 return (0);
2348
2349 /*
2350 * Next we walk down the peer list.
2351 */
2352 if (e->ne_peer) {
2353 prior = e;
2354 e = e->ne_peer;
2355 continue;
2356 }
2357
2358 /*
2359 * We can only remove the node passed in by the
2360 * caller if it is the root of the ephemeral tree.
2361 * Otherwise, the caller will remove it.
2362 */
2363 if (e == eph && isTreeRoot == FALSE)
2364 return (0);
2365
2366 /*
2367 * Okay, we have a leaf node, time
2368 * to prune it!
2369 *
2370 * Note that prior can only be NULL if
2371 * and only if it is the root of the
2372 * ephemeral tree.
2373 */
2374 prior = e->ne_prior;
2375
2376 mi = e->ne_mount;
2377 mutex_enter(&mi->mi_lock);
2378 vfsp = mi->mi_vfsp;
2379 ASSERT(vfsp != NULL);
2380
2381 /*
2382 * Cleared by umount2_engine.
2383 */
2384 VFS_HOLD(vfsp);
2385
2386 /*
2387 * Inform nfs4_unmount to not recursively
2388 * descend into this node's children when it
2389 * gets processed.
2390 */
2391 mi->mi_flags |= MI4_EPHEMERAL_RECURSED;
2392 mutex_exit(&mi->mi_lock);
2393
2394 error = umount2_engine(vfsp, flag, cr, FALSE);
2395 if (error) {
2396 /*
2397 * We need to reenable nfs4_unmount's ability
2398 * to recursively descend on this node.
2399 */
2400 mutex_enter(&mi->mi_lock);
2401 mi->mi_flags &= ~MI4_EPHEMERAL_RECURSED;
2402 mutex_exit(&mi->mi_lock);
2403
2404 return (error);
2405 }
2406
2407 /*
2408 * If we are the current node, we do not want to
2409 * touch anything else. At this point, the only
2410 * way the current node can have survived to here
2411 * is if it is the root of the ephemeral tree and
2412 * we are unmounting the enclosing mntinfo4.
2413 */
2414 if (e == eph) {
2415 ASSERT(prior == NULL);
2416 return (0);
2417 }
2418
2419 /*
2420 * Stitch up the prior node. Note that since
2421 * we have handled the root of the tree, prior
2422 * must be non-NULL.
2423 */
2424 ASSERT(prior != NULL);
2425 if (prior->ne_child == e) {
2426 prior->ne_child = NULL;
2427 } else {
2428 ASSERT(prior->ne_peer == e);
2429
2430 prior->ne_peer = NULL;
2431 }
2432
2433 e = prior;
2434 }
2435
2436 /* NOTREACHED */
2437 }
2438
2439 /*
2440 * Common code to safely release net_cnt_lock and net_tree_lock
2441 */
2442 void
2443 nfs4_ephemeral_umount_unlock(bool_t *pmust_unlock,
2444 nfs4_ephemeral_tree_t **pnet)
2445 {
2446 nfs4_ephemeral_tree_t *net = *pnet;
2447
2448 if (*pmust_unlock) {
2449 mutex_enter(&net->net_cnt_lock);
2450 net->net_status &= ~NFS4_EPHEMERAL_TREE_UMOUNTING;
2451 mutex_exit(&net->net_cnt_lock);
2452
2453 mutex_exit(&net->net_tree_lock);
2454
2455 *pmust_unlock = FALSE;
2456 }
2457 }
2458
2459 /*
2460 * While we may have removed any child or sibling nodes of this
2461 * ephemeral node, we cannot nuke it until we know that there
2462 * are no active vnodes on it. This does that final
2463 * work once we know it is not busy.
2464 */
2465 void
2466 nfs4_ephemeral_umount_activate(mntinfo4_t *mi, bool_t *pmust_unlock,
2467 nfs4_ephemeral_tree_t **pnet)
2468 {
2469 /*
2470 * Now we need to get rid of the ephemeral data if it exists.
2471 */
2472 mutex_enter(&mi->mi_lock);
2473 if (mi->mi_ephemeral) {
2474 /*
2475 * If we are the root node of an ephemeral branch
2476 * which is being removed, then we need to fixup
2477 * pointers into and out of the node.
2478 */
2479 if (!(mi->mi_flags & MI4_EPHEMERAL_RECURSED))
2480 nfs4_ephemeral_umount_cleanup(mi->mi_ephemeral);
2481
2482 nfs4_ephemeral_tree_rele(*pnet);
2483 ASSERT(mi->mi_ephemeral != NULL);
2484
2485 kmem_free(mi->mi_ephemeral, sizeof (*mi->mi_ephemeral));
2486 mi->mi_ephemeral = NULL;
2487 VFS_RELE(mi->mi_vfsp);
2488 MI4_RELE(mi);
2489 }
2490 mutex_exit(&mi->mi_lock);
2491
2492 nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2493 }
2494
2495 /*
2496 * Unmount an ephemeral node.
2497 *
2498 * Note that if this code fails, then it must unlock.
2499 *
2500 * If it succeeds, then the caller must be prepared to unlock.
2501 */
2502 int
2503 nfs4_ephemeral_umount(mntinfo4_t *mi, int flag, cred_t *cr,
2504 bool_t *pmust_unlock, nfs4_ephemeral_tree_t **pnet)
2505 {
2506 int error = 0;
2507 nfs4_ephemeral_t *eph;
2508 nfs4_ephemeral_tree_t *net;
2509 int is_derooting = FALSE;
2510 int is_recursed = FALSE;
2511 int was_locked = FALSE;
2512
2513 /*
2514 * Make sure to set the default state for cleaning
2515 * up the tree in the caller (and on the way out).
2516 */
2517 *pmust_unlock = FALSE;
2518
2519 /*
2520 * The active vnodes on this file system may be ephemeral
2521 * children. We need to check for and try to unmount them
2522 * here. If any cannot be unmounted, we are going
2523 * to return EBUSY.
2524 */
2525 mutex_enter(&mi->mi_lock);
2526
2527 /*
2528 * If an ephemeral tree, we need to check to see if
2529 * the lock is already held. If it is, then we need
2530 * to see if we are being called as a result of
2531 * the recursive removal of some node of the tree or
2532 * if we are another attempt to remove the tree.
2533 *
2534 * mi_flags & MI4_EPHEMERAL indicates an ephemeral
2535 * node. mi_ephemeral being non-NULL also does this.
2536 *
2537 * mi_ephemeral_tree being non-NULL is sufficient
2538 * to also indicate either it is an ephemeral node
2539 * or the enclosing mntinfo4.
2540 *
2541 * Do we need MI4_EPHEMERAL? Yes, it is useful
2542 * when we delete the ephemeral node and need to
2543 * differentiate an ephemeral node from the
2544 * enclosing root node.
2545 */
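	/*
	 * A sketch of the state encoding used here:
	 *
	 *	mi_ephemeral_tree == NULL	not part of any tree
	 *	mi_ephemeral != NULL		an ephemeral node
	 *	tree set, mi_ephemeral NULL	the enclosing root,
	 *					i.e. we are derooting
	 */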
2546 *pnet = net = mi->mi_ephemeral_tree;
2547 if (net == NULL) {
2548 mutex_exit(&mi->mi_lock);
2549 return (0);
2550 }
2551
2552 eph = mi->mi_ephemeral;
2553 is_recursed = mi->mi_flags & MI4_EPHEMERAL_RECURSED;
2554 is_derooting = (eph == NULL);
2555
2556 mutex_enter(&net->net_cnt_lock);
2557
2558 /*
2559 * If this is not recursion, then we need to
2560 * check to see if a harvester thread has
2561 * already grabbed the lock.
2562 *
2563 * After we exit this branch, we may not
2564 * blindly return; we need to jump to
2565 * is_busy!
2566 */
2567 if (!is_recursed) {
2568 if (net->net_status &
2569 NFS4_EPHEMERAL_TREE_LOCKED) {
2570 /*
2571 * If the tree is locked, we need
2572 * to decide whether we are the
2573 * harvester or some explicit call
2574 * for a umount. The only way that
2575 * we are the harvester is if
2576 * MS_SYSSPACE is set.
2577 *
2578 * We only let the harvester through
2579 * at this point.
2580 *
2581 * We return EBUSY so that the
2582 * caller knows something is
2583 * going on. Note that by that
2584 * time, the umount in the other
2585 * thread may have already occurred.
2586 */
2587 if (!(flag & MS_SYSSPACE)) {
2588 mutex_exit(&net->net_cnt_lock);
2589 mutex_exit(&mi->mi_lock);
2590
2591 return (EBUSY);
2592 }
2593
2594 was_locked = TRUE;
2595 }
2596 }
2597
2598 mutex_exit(&net->net_cnt_lock);
2599 mutex_exit(&mi->mi_lock);
2600
2601 /*
2602 * If we are not the harvester, we need to check
2603 * to see if we need to grab the tree lock.
2604 */
2605 if (was_locked == FALSE) {
2606 /*
2607 * If we grab the lock, it means that no other
2608 * operation is working on the tree. If we don't
2609 * grab it, we need to decide if this is because
2610 * we are a recursive call or a new operation.
2611 */
2612 if (mutex_tryenter(&net->net_tree_lock)) {
2613 *pmust_unlock = TRUE;
2614 } else {
2615 /*
2616 * If we are a recursive call, we can
2617 * proceed without the lock.
2618 * Otherwise we have to wait until
2619 * the lock becomes free.
2620 */
2621 if (!is_recursed) {
2622 mutex_enter(&net->net_cnt_lock);
2623 if (net->net_status &
2624 (NFS4_EPHEMERAL_TREE_DEROOTING
2625 | NFS4_EPHEMERAL_TREE_INVALID)) {
2626 mutex_exit(&net->net_cnt_lock);
2627 goto is_busy;
2628 }
2629 mutex_exit(&net->net_cnt_lock);
2630
2631 /*
2632 * We can't hold any other locks whilst
2633 * we wait on this to free up.
2634 */
2635 mutex_enter(&net->net_tree_lock);
2636
2637 /*
2638 * Note that while mi->mi_ephemeral
2639 * may change and thus we have to
2640 * update eph, it is the case that
2641 * we have tied down net and
2642 * do not care if mi->mi_ephemeral_tree
2643 * has changed.
2644 */
2645 mutex_enter(&mi->mi_lock);
2646 eph = mi->mi_ephemeral;
2647 mutex_exit(&mi->mi_lock);
2648
2649 /*
2650 * Okay, we need to see if either the
2651 * tree got nuked or the current node
2652 * got nuked. Either will cause
2653 * an error.
2654 *
2655 * Note that a subsequent retry of the
2656 * umount shall work.
2657 */
2658 mutex_enter(&net->net_cnt_lock);
2659 if (net->net_status &
2660 NFS4_EPHEMERAL_TREE_INVALID ||
2661 (!is_derooting && eph == NULL)) {
2662 mutex_exit(&net->net_cnt_lock);
2663 mutex_exit(&net->net_tree_lock);
2664 goto is_busy;
2665 }
2666 mutex_exit(&net->net_cnt_lock);
2667 *pmust_unlock = TRUE;
2668 }
2669 }
2670 }
2671
2672 /*
2673 * Only once we have grabbed the lock can we mark what we
2674 * are planning on doing to the ephemeral tree.
2675 */
2676 if (*pmust_unlock) {
2677 mutex_enter(&net->net_cnt_lock);
2678 net->net_status |= NFS4_EPHEMERAL_TREE_UMOUNTING;
2679
2680 /*
2681 * Check to see if we are nuking the root.
2682 */
2683 if (is_derooting)
2684 net->net_status |=
2685 NFS4_EPHEMERAL_TREE_DEROOTING;
2686 mutex_exit(&net->net_cnt_lock);
2687 }
2688
2689 if (!is_derooting) {
2690 /*
2691 * Only work on children if the caller has not already
2692 * done so.
2693 */
2694 if (!is_recursed) {
2695 ASSERT(eph != NULL);
2696
2697 error = nfs4_ephemeral_unmount_engine(eph,
2698 FALSE, flag, cr);
2699 if (error)
2700 goto is_busy;
2701 }
2702 } else {
2703 eph = net->net_root;
2704
2705 /*
2706 * Only work if there is something there.
2707 */
2708 if (eph) {
2709 error = nfs4_ephemeral_unmount_engine(eph, TRUE,
2710 flag, cr);
2711 if (error) {
2712 mutex_enter(&net->net_cnt_lock);
2713 net->net_status &=
2714 ~NFS4_EPHEMERAL_TREE_DEROOTING;
2715 mutex_exit(&net->net_cnt_lock);
2716 goto is_busy;
2717 }
2718
2719 /*
2720 * Nothing else which goes wrong will
2721 * invalidate the blowing away of the
2722 * ephemeral tree.
2723 */
2724 net->net_root = NULL;
2725 }
2726
2727 /*
2728 * We have derooted and we have caused the tree to be
2729 * invalidated.
2730 */
2731 mutex_enter(&net->net_cnt_lock);
2732 net->net_status &= ~NFS4_EPHEMERAL_TREE_DEROOTING;
2733 net->net_status |= NFS4_EPHEMERAL_TREE_INVALID;
2734 DTRACE_NFSV4_1(nfs4clnt__dbg__ephemeral__tree__derooting,
2735 uint_t, net->net_refcnt);
2736
2737 /*
2738 * We will not finalize this node, so safe to
2739 * release it.
2740 */
2741 nfs4_ephemeral_tree_decr(net);
2742 mutex_exit(&net->net_cnt_lock);
2743
2744 if (was_locked == FALSE)
2745 mutex_exit(&net->net_tree_lock);
2746
2747 /*
2748 * We have just blown away any notion of this
2749 * tree being locked or having a refcnt.
2750 * We can't let the caller try to clean things up.
2751 */
2752 *pmust_unlock = FALSE;
2753
2754 /*
2755 * At this point, the tree should no longer be
2756 * associated with the mntinfo4. We need to pull
2757 * it off there and let the harvester take
2758 * care of it once the refcnt drops.
2759 */
2760 mutex_enter(&mi->mi_lock);
2761 mi->mi_ephemeral_tree = NULL;
2762 mutex_exit(&mi->mi_lock);
2763 }
2764
2765 return (0);
2766
2767 is_busy:
2768
2769 nfs4_ephemeral_umount_unlock(pmust_unlock, pnet);
2770
2771 return (error);
2772 }
2773
2774 /*
2775 * Do the umount and record any error in the parent.
2776 */
2777 static void
2778 nfs4_ephemeral_record_umount(vfs_t *vfsp, int flag,
2779 nfs4_ephemeral_t *e, nfs4_ephemeral_t *prior)
2780 {
2781 int error;
2782
2783 /*
2784 * Only act if the fs is still mounted.
2785 */
2786 if (vfsp == NULL)
2787 return;
2788
2789 error = umount2_engine(vfsp, flag, kcred, FALSE);
2790 if (error) {
2791 if (prior) {
2792 if (prior->ne_child == e)
2793 prior->ne_state |=
2794 NFS4_EPHEMERAL_CHILD_ERROR;
2795 else
2796 prior->ne_state |=
2797 NFS4_EPHEMERAL_PEER_ERROR;
2798 }
2799 }
2800 }
2801
2802 /*
2803 * For each tree in the forest (where the forest is in
2804 * effect all of the ephemeral trees for this zone),
2805 * scan to see if a node can be unmounted. Note that
2806 * unlike nfs4_ephemeral_unmount_engine(), we do
2807 * not process the current node before children or
2808 * siblings. I.e., if a node can be unmounted, we
2809 * do not recursively check to see if the nodes
2810 * hanging off of it can also be unmounted.
2811 *
2812 * Instead, we delve down deep to try and remove the
2813 * children first. Then, because we share code with
2814 * nfs4_ephemeral_unmount_engine(), we will try
2815 * them again. This could be a performance issue in
2816 * the future.
2817 *
2818 * Also note that unlike nfs4_ephemeral_unmount_engine(),
2819 * we do not halt on an error. We will not remove the
2820 * current node, but we will keep on trying to remove
2821 * the others.
2822 *
2823 * force indicates that we want the unmount to occur
2824 * even if there is something blocking it.
2825 *
2826 * time_check indicates that we want to see if the
2827 * mount has expired past mount_to or not. Typically
2828 * we want to do this; only on a shutdown of the
2829 * zone would we ignore the check.
2830 */
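/*
 * A sketch of the per-node state machine driven by the loop below:
 * a node starts in VISIT_CHILD, moves to VISIT_SIBLING once its
 * children have been descended into, then to PROCESS_ME, at which
 * point the unmount is attempted. CHILD_ERROR and PEER_ERROR
 * propagate failures back up via the ne_prior pointers.
 */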
2831 static void
2832 nfs4_ephemeral_harvest_forest(nfs4_trigger_globals_t *ntg,
2833 bool_t force, bool_t time_check)
2834 {
2835 nfs4_ephemeral_tree_t *net;
2836 nfs4_ephemeral_tree_t *prev = NULL;
2837 nfs4_ephemeral_tree_t *next;
2838 nfs4_ephemeral_t *e;
2839 nfs4_ephemeral_t *prior;
2840 time_t now = gethrestime_sec();
2841
2842 nfs4_ephemeral_tree_t *harvest = NULL;
2843
2844 int flag;
2845
2846 mntinfo4_t *mi;
2847 vfs_t *vfsp;
2848
2849 if (force)
2850 flag = MS_FORCE | MS_SYSSPACE;
2851 else
2852 flag = MS_SYSSPACE;
2853
2854 mutex_enter(&ntg->ntg_forest_lock);
2855 for (net = ntg->ntg_forest; net != NULL; net = next) {
2856 next = net->net_next;
2857
2858 nfs4_ephemeral_tree_hold(net);
2859
2860 mutex_enter(&net->net_tree_lock);
2861
2862 /*
2863 * Let the unmount code know that the
2864 * tree is already locked!
2865 */
2866 mutex_enter(&net->net_cnt_lock);
2867 net->net_status |= NFS4_EPHEMERAL_TREE_LOCKED;
2868 mutex_exit(&net->net_cnt_lock);
2869
2870 /*
2871 * If the intent is to force all ephemeral nodes to
2872 * be unmounted in this zone, we can short circuit a
2873 * lot of tree traversal and simply zap the root node.
2874 */
2875 if (force) {
2876 if (net->net_root) {
2877 mi = net->net_root->ne_mount;
2878
2879 vfsp = mi->mi_vfsp;
2880 ASSERT(vfsp != NULL);
2881
2882 /*
2883 * Cleared by umount2_engine.
2884 */
2885 VFS_HOLD(vfsp);
2886
2887 (void) umount2_engine(vfsp, flag,
2888 kcred, FALSE);
2889
2890 goto check_done;
2891 }
2892 }
2893
2894 e = net->net_root;
2895 if (e)
2896 e->ne_state = NFS4_EPHEMERAL_VISIT_CHILD;
2897
2898 while (e) {
2899 if (e->ne_state == NFS4_EPHEMERAL_VISIT_CHILD) {
2900 e->ne_state = NFS4_EPHEMERAL_VISIT_SIBLING;
2901 if (e->ne_child) {
2902 e = e->ne_child;
2903 e->ne_state =
2904 NFS4_EPHEMERAL_VISIT_CHILD;
2905 }
2906
2907 continue;
2908 } else if (e->ne_state ==
2909 NFS4_EPHEMERAL_VISIT_SIBLING) {
2910 e->ne_state = NFS4_EPHEMERAL_PROCESS_ME;
2911 if (e->ne_peer) {
2912 e = e->ne_peer;
2913 e->ne_state =
2914 NFS4_EPHEMERAL_VISIT_CHILD;
2915 }
2916
2917 continue;
2918 } else if (e->ne_state ==
2919 NFS4_EPHEMERAL_CHILD_ERROR) {
2920 prior = e->ne_prior;
2921
2922 /*
2923 * If a child reported an error, do
2924 * not bother trying to unmount.
2925 *
2926 * If your prior node is a parent,
2927 * pass the error up such that they
2928 * also do not try to unmount.
2929 *
2930 * However, if your prior is a sibling,
2931 * let them try to unmount if they can.
2932 */
2933 if (prior) {
2934 if (prior->ne_child == e)
2935 prior->ne_state |=
2936 NFS4_EPHEMERAL_CHILD_ERROR;
2937 else
2938 prior->ne_state |=
2939 NFS4_EPHEMERAL_PEER_ERROR;
2940 }
2941
2942 /*
2943 * Clear the error and, if needed, process peers.
2944 *
2945 * Once we mask out the error, we know whether
2946 * or not we have to process another node.
2947 */
2948 e->ne_state &= ~NFS4_EPHEMERAL_CHILD_ERROR;
2949 if (e->ne_state == NFS4_EPHEMERAL_PROCESS_ME)
2950 e = prior;
2951
2952 continue;
2953 } else if (e->ne_state ==
2954 NFS4_EPHEMERAL_PEER_ERROR) {
2955 prior = e->ne_prior;
2956
2957 if (prior) {
2958 if (prior->ne_child == e)
2959 prior->ne_state =
2960 NFS4_EPHEMERAL_CHILD_ERROR;
2961 else
2962 prior->ne_state =
2963 NFS4_EPHEMERAL_PEER_ERROR;
2964 }
2965
2966 /*
2967 * Clear the error from this node and do the
2968 * correct processing.
2969 */
2970 e->ne_state &= ~NFS4_EPHEMERAL_PEER_ERROR;
2971 continue;
2972 }
2973
2974 prior = e->ne_prior;
2975 e->ne_state = NFS4_EPHEMERAL_OK;
2976
2977 /*
2978 * It must be the case that we need to process
2979 * this node.
2980 */
2981 if (!time_check ||
2982 now - e->ne_ref_time > e->ne_mount_to) {
2983 mi = e->ne_mount;
2984 vfsp = mi->mi_vfsp;
2985
2986 /*
2987 * Cleared by umount2_engine.
2988 */
2989 if (vfsp != NULL)
2990 VFS_HOLD(vfsp);
2991
2992 /*
2993 * Note that we effectively work down to the
2994 * leaf nodes first, try to unmount them,
2995 * then work our way back up, retrying the
2996 * interior nodes.
2997 *
2998 * Also note that we deal with a lot of
2999 * complexity by sharing the work with
3000 * the manual unmount code.
3001 */
3002 nfs4_ephemeral_record_umount(vfsp, flag,
3003 e, prior);
3004 }
3005
3006 e = prior;
3007 }
3008
3009 check_done:
3010
3011 /*
3012 * At this point we are done processing this tree.
3013 *
3014 * If the tree is invalid and we were the only reference
3015 * to it, then we push it on the local linked list
3016 * to remove it at the end. We avoid that action now
3017 * to keep the tree processing going along at a fair clip.
3018 *
3019 * Else, even if we were the only reference, we
3020 * allow it to be reused as needed.
3021 */
3022 mutex_enter(&net->net_cnt_lock);
3023 nfs4_ephemeral_tree_decr(net);
3024 if (net->net_refcnt == 0 &&
3025 net->net_status & NFS4_EPHEMERAL_TREE_INVALID) {
3026 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3027 mutex_exit(&net->net_cnt_lock);
3028 mutex_exit(&net->net_tree_lock);
3029
3030 if (prev)
3031 prev->net_next = net->net_next;
3032 else
3033 ntg->ntg_forest = net->net_next;
3034
3035 net->net_next = harvest;
3036 harvest = net;
3037
3038 VFS_RELE(net->net_mount->mi_vfsp);
3039 MI4_RELE(net->net_mount);
3040
3041 continue;
3042 }
3043
3044 net->net_status &= ~NFS4_EPHEMERAL_TREE_LOCKED;
3045 mutex_exit(&net->net_cnt_lock);
3046 mutex_exit(&net->net_tree_lock);
3047
3048 prev = net;
3049 }
3050 mutex_exit(&ntg->ntg_forest_lock);
3051
3052 for (net = harvest; net != NULL; net = next) {
3053 next = net->net_next;
3054
3055 mutex_destroy(&net->net_tree_lock);
3056 mutex_destroy(&net->net_cnt_lock);
3057 kmem_free(net, sizeof (*net));
3058 }
3059 }
3060
3061 /*
3062 * This is the thread which decides when the harvesting
3063 * can proceed and when to kill it off for this zone.
3064 */
3065 static void
3066 nfs4_ephemeral_harvester(nfs4_trigger_globals_t *ntg)
3067 {
3068 clock_t timeleft;
3069 zone_t *zone = curproc->p_zone;
3070
3071 for (;;) {
3072 timeleft = zone_status_timedwait(zone, ddi_get_lbolt() +
3073 nfs4_trigger_thread_timer * hz, ZONE_IS_SHUTTING_DOWN);
3074
3075 /*
3076 * zone is exiting...
3077 */
3078 if (timeleft != -1) {
3079 ASSERT(zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN);
3080 zthread_exit();
3081 /* NOTREACHED */
3082 }
3083
3084 /*
3085 * Only bother scanning if there is potential
3086 * work to be done.
3087 */
3088 if (ntg->ntg_forest == NULL)
3089 continue;
3090
3091 /*
3092 * Now scan the list and get rid of everything which
3093 * is old.
3094 */
3095 nfs4_ephemeral_harvest_forest(ntg, FALSE, TRUE);
3096 }
3097
3098 /* NOTREACHED */
3099 }
3100
3101 /*
3102 * The zone specific glue needed to start the unmount harvester.
3103 *
3104 * Note that we want to avoid taking the mutex whenever possible,
3105 * hence the multiple checks.
3106 *
3107 * The caller should avoid us getting down here in the first
3108 * place.
3109 */
3110 static void
3111 nfs4_ephemeral_start_harvester(nfs4_trigger_globals_t *ntg)
3112 {
3113 /*
3114 * It got started before we got here...
3115 */
3116 if (ntg->ntg_thread_started)
3117 return;
3118
3119 mutex_enter(&nfs4_ephemeral_thread_lock);
3120
3121 if (ntg->ntg_thread_started) {
3122 mutex_exit(&nfs4_ephemeral_thread_lock);
3123 return;
3124 }
3125
3126 /*
3127 * Start the unmounter harvester thread for this zone.
3128 */
3129 (void) zthread_create(NULL, 0, nfs4_ephemeral_harvester,
3130 ntg, 0, minclsyspri);
3131
3132 ntg->ntg_thread_started = TRUE;
3133 mutex_exit(&nfs4_ephemeral_thread_lock);
3134 }
3135
3136 /*ARGSUSED*/
3137 static void *
3138 nfs4_ephemeral_zsd_create(zoneid_t zoneid)
3139 {
3140 nfs4_trigger_globals_t *ntg;
3141
3142 ntg = kmem_zalloc(sizeof (*ntg), KM_SLEEP);
3143 ntg->ntg_thread_started = FALSE;
3144
3145 /*
3146 * This is the default....
3147 */
3148 ntg->ntg_mount_to = nfs4_trigger_mount_to;
3149
3150 mutex_init(&ntg->ntg_forest_lock, NULL,
3151 MUTEX_DEFAULT, NULL);
3152
3153 return (ntg);
3154 }
3155
3156 /*
3157 * Try a nice gentle walk down the forest and convince
3158 * all of the trees to gracefully give it up.
3159 */
3160 /*ARGSUSED*/
3161 static void
3162 nfs4_ephemeral_zsd_shutdown(zoneid_t zoneid, void *arg)
3163 {
3164 nfs4_trigger_globals_t *ntg = arg;
3165
3166 if (!ntg)
3167 return;
3168
3169 nfs4_ephemeral_harvest_forest(ntg, FALSE, FALSE);
3170 }
3171
3172 /*
3173 * Race along the forest and rip all of the trees out by
3174 * their rootballs!
3175 */
3176 /*ARGSUSED*/
3177 static void
3178 nfs4_ephemeral_zsd_destroy(zoneid_t zoneid, void *arg)
3179 {
3180 nfs4_trigger_globals_t *ntg = arg;
3181
3182 if (!ntg)
3183 return;
3184
3185 nfs4_ephemeral_harvest_forest(ntg, TRUE, FALSE);
3186
3187 mutex_destroy(&ntg->ntg_forest_lock);
3188 kmem_free(ntg, sizeof (*ntg));
3189 }
3190
3191 /*
3192 * This is the zone independent cleanup needed for
3193 * ephemeral mount processing.
3194 */
3195 void
3196 nfs4_ephemeral_fini(void)
3197 {
3198 (void) zone_key_delete(nfs4_ephemeral_key);
3199 mutex_destroy(&nfs4_ephemeral_thread_lock);
3200 }
3201
3202 /*
3203 * This is the zone independent initialization needed for
3204 * ephemeral mount processing.
3205 */
3206 void
3207 nfs4_ephemeral_init(void)
3208 {
3209 mutex_init(&nfs4_ephemeral_thread_lock, NULL, MUTEX_DEFAULT,
3210 NULL);
3211
3212 zone_key_create(&nfs4_ephemeral_key, nfs4_ephemeral_zsd_create,
3213 nfs4_ephemeral_zsd_shutdown, nfs4_ephemeral_zsd_destroy);
3214 }
3215
3216 /*
3217 * nfssys() calls this function to set the per-zone
3218 * value of mount_to to drive when an ephemeral mount is
3219 * timed out. Each mount will grab a copy of this value
3220 * when mounted.
3221 */
3222 void
3223 nfs4_ephemeral_set_mount_to(uint_t mount_to)
3224 {
3225 nfs4_trigger_globals_t *ntg;
3226 zone_t *zone = curproc->p_zone;
3227
3228 ntg = zone_getspecific(nfs4_ephemeral_key, zone);
3229
3230 ntg->ntg_mount_to = mount_to;
3231 }
3232
3233 /*
3234 * Walk the list of v4 mount options; if they are currently set in vfsp,
3235 * append them to a new comma-separated mount option string, and return it.
3236 *
3237 * Caller should free by calling nfs4_trigger_destroy_mntopts().
3238 */
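/*
 * For example (hypothetical options): a parent mounted with
 * "noxattr,nocto" would yield the string "noxattr,nocto" here,
 * which nfs4_trigger_domount() then hands to domount() via
 * MS_OPTIONSTR.
 */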
3239 static char *
3240 nfs4_trigger_create_mntopts(vfs_t *vfsp)
3241 {
3242 uint_t i;
3243 char *mntopts;
3244 struct vfssw *vswp;
3245 mntopts_t *optproto;
3246
3247 mntopts = kmem_zalloc(MAX_MNTOPT_STR, KM_SLEEP);
3248
3249 /* get the list of applicable mount options for v4; locks *vswp */
3250 vswp = vfs_getvfssw(MNTTYPE_NFS4);
3251 optproto = &vswp->vsw_optproto;
3252
3253 for (i = 0; i < optproto->mo_count; i++) {
3254 struct mntopt *mop = &optproto->mo_list[i];
3255
3256 if (mop->mo_flags & MO_EMPTY)
3257 continue;
3258
3259 if (nfs4_trigger_add_mntopt(mntopts, mop->mo_name, vfsp)) {
3260 kmem_free(mntopts, MAX_MNTOPT_STR);
3261 vfs_unrefvfssw(vswp);
3262 return (NULL);
3263 }
3264 }
3265
3266 vfs_unrefvfssw(vswp);
3267
3268 /*
3269 * MNTOPT_XATTR is not in the v4 mount opt proto list,
3270 * and it may only be passed via MS_OPTIONSTR, so we
3271 * must handle it here.
3272 *
3273 * Ideally, it would be in the list, but NFS does not specify its
3274 * own opt proto list; it instead uses the default one. Since
3275 * not all filesystems support extended attrs, it would not be
3276 * appropriate to add it there.
3277 */
3278 if (nfs4_trigger_add_mntopt(mntopts, MNTOPT_XATTR, vfsp) ||
3279 nfs4_trigger_add_mntopt(mntopts, MNTOPT_NOXATTR, vfsp)) {
3280 kmem_free(mntopts, MAX_MNTOPT_STR);
3281 return (NULL);
3282 }
3283
3284 return (mntopts);
3285 }
3286
3287 static void
3288 nfs4_trigger_destroy_mntopts(char *mntopts)
3289 {
3290 if (mntopts)
3291 kmem_free(mntopts, MAX_MNTOPT_STR);
3292 }
3293
3294 /*
3295 * Check a single mount option (optname). Add to mntopts if it is set in VFS.
3296 */
3297 static int
3298 nfs4_trigger_add_mntopt(char *mntopts, char *optname, vfs_t *vfsp)
3299 {
3300 if (mntopts == NULL || optname == NULL || vfsp == NULL)
3301 return (EINVAL);
3302
3303 if (vfs_optionisset(vfsp, optname, NULL)) {
3304 size_t mntoptslen = strlen(mntopts);
3305 size_t optnamelen = strlen(optname);
3306
3307 /* +1 for ',', +1 for NUL */
3308 if (mntoptslen + optnamelen + 2 > MAX_MNTOPT_STR)
3309 return (EOVERFLOW);
3310
3311 /* first or subsequent mount option? */
3312 if (*mntopts != '\0')
3313 (void) strcat(mntopts, ",");
3314
3315 (void) strcat(mntopts, optname);
3316 }
3317
3318 return (0);
3319 }
3320
3321 static enum clnt_stat
3322 nfs4_ping_server_common(struct knetconfig *knc, struct netbuf *addr, int nointr)
3323 {
3324 int retries;
3325 uint_t max_msgsize;
3326 enum clnt_stat status;
3327 CLIENT *cl;
3328 struct timeval timeout;
3329
3330 /* as per recov_newserver() */
3331 max_msgsize = 0;
3332 retries = 1;
3333 timeout.tv_sec = 2;
3334 timeout.tv_usec = 0;
3335
3336 if (clnt_tli_kcreate(knc, addr, NFS_PROGRAM, NFS_V4,
3337 max_msgsize, retries, CRED(), &cl) != 0)
3338 return (RPC_FAILED);
3339
3340 if (nointr)
3341 cl->cl_nosignal = TRUE;
3342 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL, xdr_void, NULL,
3343 timeout);
3344 if (nointr)
3345 cl->cl_nosignal = FALSE;
3346
3347 AUTH_DESTROY(cl->cl_auth);
3348 CLNT_DESTROY(cl);
3349
3350 return (status);
3351 }
3352
3353 static enum clnt_stat
3354 nfs4_trigger_ping_server(servinfo4_t *svp, int nointr)
3355 {
3356 return (nfs4_ping_server_common(svp->sv_knconf, &svp->sv_addr, nointr));
3357 }
3358