1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/param.h>
27 #include <sys/types.h>
28 #include <sys/systm.h>
29 #include <sys/cred.h>
30 #include <sys/proc.h>
31 #include <sys/user.h>
32 #include <sys/time.h>
33 #include <sys/buf.h>
34 #include <sys/vfs.h>
35 #include <sys/vnode.h>
36 #include <sys/socket.h>
37 #include <sys/uio.h>
38 #include <sys/tiuser.h>
39 #include <sys/swap.h>
40 #include <sys/errno.h>
41 #include <sys/debug.h>
42 #include <sys/kmem.h>
43 #include <sys/kstat.h>
44 #include <sys/cmn_err.h>
45 #include <sys/vtrace.h>
46 #include <sys/session.h>
47 #include <sys/dnlc.h>
48 #include <sys/bitmap.h>
49 #include <sys/acl.h>
50 #include <sys/ddi.h>
51 #include <sys/pathname.h>
52 #include <sys/flock.h>
53 #include <sys/dirent.h>
54 #include <sys/flock.h>
55 #include <sys/callb.h>
56 #include <sys/atomic.h>
57 #include <sys/list.h>
58 #include <sys/tsol/tnet.h>
59 #include <sys/priv.h>
60 #include <sys/sdt.h>
61 #include <sys/attr.h>
62
63 #include <inet/ip6.h>
64
65 #include <rpc/types.h>
66 #include <rpc/xdr.h>
67 #include <rpc/auth.h>
68 #include <rpc/clnt.h>
69
70 #include <nfs/nfs.h>
71 #include <nfs/nfs4.h>
72 #include <nfs/nfs_clnt.h>
73 #include <nfs/rnode.h>
74 #include <nfs/nfs_acl.h>
75
76 #include <sys/tsol/label.h>
77
78 /*
79 * The hash queues for the access to active and cached rnodes
80 * are organized as doubly linked lists. A reader/writer lock
81 * for each hash bucket is used to control access and to synchronize
82 * lookups, additions, and deletions from the hash queue.
83 *
84 * The rnode freelist is organized as a doubly linked list with
85 * a head pointer. Additions and deletions are synchronized via
86 * a single mutex.
87 *
88 * In order to add an rnode to the free list, it must be hashed into
89 * a hash queue and the exclusive lock to the hash queue be held.
90 * If an rnode is not hashed into a hash queue, then it is destroyed
91 * because it represents no valuable information that can be reused
92 * about the file. The exclusive lock to the hash queue must be
93 * held in order to prevent a lookup in the hash queue from finding
94  * the rnode, using it, and assuming that the rnode is not on the
95 * freelist. The lookup in the hash queue will have the hash queue
96 * locked, either exclusive or shared.
97 *
98 * The vnode reference count for each rnode is not allowed to drop
99 * below 1. This prevents external entities, such as the VM
100 * subsystem, from acquiring references to vnodes already on the
101 * freelist and then trying to place them back on the freelist
102  * when their reference is released. This means that when an
103 * rnode is looked up in the hash queues, then either the rnode
104 * is removed from the freelist and that reference is transferred to
105  * the new reference, or the vnode reference count must be incremented
106 * accordingly. The mutex for the freelist must be held in order to
107 * accurately test to see if the rnode is on the freelist or not.
108 * The hash queue lock might be held shared and it is possible that
109 * two different threads may race to remove the rnode from the
110 * freelist. This race can be resolved by holding the mutex for the
111 * freelist. Please note that the mutex for the freelist does not
112  * need to be held if the rnode is not on the freelist. It cannot be
113 * placed on the freelist due to the requirement that the thread
114 * putting the rnode on the freelist must hold the exclusive lock
115 * to the hash queue and the thread doing the lookup in the hash
116 * queue is holding either a shared or exclusive lock to the hash
117 * queue.
118 *
119 * The lock ordering is:
120 *
121 * hash bucket lock -> vnode lock
122 * hash bucket lock -> freelist lock
123 */
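/*
 * A sketch (illustrative only; not actual code from this file, with rnode
 * field names assumed from nfs/rnode.h) of the ordering above for a lookup
 * that revives an rnode from the freelist:
 *
 *	rw_enter(&hashq->r_lock, RW_READER);	hash bucket lock first
 *	...find the rnode in this bucket...
 *	mutex_enter(&rpfreelist_lock);		then the freelist lock
 *	if (rp->r_freef != NULL)
 *		rp_rmfree(rp);			reuse the freelist's hold
 *	else
 *		VN_HOLD(RTOV(rp));		take a new hold
 *	mutex_exit(&rpfreelist_lock);
 *	rw_exit(&hashq->r_lock);
 */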
124 static rhashq_t *rtable;
125
126 static kmutex_t rpfreelist_lock;
127 static rnode_t *rpfreelist = NULL;
128 static long rnew = 0;
129 long nrnode = 0;
130
131 static int rtablesize;
132 static int rtablemask;
133
134 static int hashlen = 4;
135
136 static struct kmem_cache *rnode_cache;
137
138 /*
139 * Mutex to protect the following variables:
140 * nfs_major
141 * nfs_minor
142 */
143 kmutex_t nfs_minor_lock;
144 int nfs_major;
145 int nfs_minor;
146
147 /* Do we allow preepoch (negative) time values otw? */
148 bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */
149
150 /*
151 * Access cache
152 */
153 static acache_hash_t *acache;
154 static long nacache; /* used strictly to size the number of hash queues */
155
156 static int acachesize;
157 static int acachemask;
158 static struct kmem_cache *acache_cache;
159
160 /*
161 * Client side utilities
162 */
163
164 /*
165 * client side statistics
166 */
167 static const struct clstat clstat_tmpl = {
168 { "calls", KSTAT_DATA_UINT64 },
169 { "badcalls", KSTAT_DATA_UINT64 },
170 { "clgets", KSTAT_DATA_UINT64 },
171 { "cltoomany", KSTAT_DATA_UINT64 },
172 #ifdef DEBUG
173 { "clalloc", KSTAT_DATA_UINT64 },
174 { "noresponse", KSTAT_DATA_UINT64 },
175 { "failover", KSTAT_DATA_UINT64 },
176 { "remap", KSTAT_DATA_UINT64 },
177 #endif
178 };
179
180 /*
181 * The following are statistics that describe behavior of the system as a whole
182  * and don't correspond to any one particular zone.
183 */
184 #ifdef DEBUG
185 static struct clstat_debug {
186 kstat_named_t nrnode; /* number of allocated rnodes */
187 kstat_named_t access; /* size of access cache */
188 kstat_named_t dirent; /* size of readdir cache */
189 kstat_named_t dirents; /* size of readdir buf cache */
190 kstat_named_t reclaim; /* number of reclaims */
191 kstat_named_t clreclaim; /* number of cl reclaims */
192 kstat_named_t f_reclaim; /* number of free reclaims */
193 kstat_named_t a_reclaim; /* number of active reclaims */
194 kstat_named_t r_reclaim; /* number of rnode reclaims */
195 kstat_named_t rpath; /* bytes used to store rpaths */
196 } clstat_debug = {
197 { "nrnode", KSTAT_DATA_UINT64 },
198 { "access", KSTAT_DATA_UINT64 },
199 { "dirent", KSTAT_DATA_UINT64 },
200 { "dirents", KSTAT_DATA_UINT64 },
201 { "reclaim", KSTAT_DATA_UINT64 },
202 { "clreclaim", KSTAT_DATA_UINT64 },
203 { "f_reclaim", KSTAT_DATA_UINT64 },
204 { "a_reclaim", KSTAT_DATA_UINT64 },
205 { "r_reclaim", KSTAT_DATA_UINT64 },
206 { "r_path", KSTAT_DATA_UINT64 },
207 };
208 #endif /* DEBUG */
209
210 /*
211 * We keep a global list of per-zone client data, so we can clean up all zones
212 * if we get low on memory.
213 */
214 static list_t nfs_clnt_list;
215 static kmutex_t nfs_clnt_list_lock;
216 static zone_key_t nfsclnt_zone_key;
217
218 static struct kmem_cache *chtab_cache;
219
220 /*
221 * Some servers do not properly update the attributes of the
222 * directory when changes are made. To allow interoperability
223 * with these broken servers, the nfs_disable_rddir_cache
224 * parameter must be set in /etc/system
225 */
226 int nfs_disable_rddir_cache = 0;
227
228 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
229 struct chtab **);
230 void clfree(CLIENT *, struct chtab *);
231 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
232 struct chtab **, struct nfs_clnt *);
233 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
234 struct chtab **, struct nfs_clnt *);
235 static void clreclaim(void *);
236 static int nfs_feedback(int, int, mntinfo_t *);
237 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
238 caddr_t, cred_t *, int *, enum clnt_stat *, int,
239 failinfo_t *);
240 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
241 caddr_t, cred_t *, int *, int, failinfo_t *);
242 static void rinactive(rnode_t *, cred_t *);
243 static int rtablehash(nfs_fhandle *);
244 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
245 struct vnodeops *,
246 int (*)(vnode_t *, page_t *, u_offset_t *, size_t *, int,
247 cred_t *),
248 int (*)(const void *, const void *), int *, cred_t *,
249 char *, char *);
250 static void rp_rmfree(rnode_t *);
251 static void rp_addhash(rnode_t *);
252 static void rp_rmhash_locked(rnode_t *);
253 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
254 static void destroy_rnode(rnode_t *);
255 static void rddir_cache_free(rddir_cache *);
256 static int nfs_free_data_reclaim(rnode_t *);
257 static int nfs_active_data_reclaim(rnode_t *);
258 static int nfs_free_reclaim(void);
259 static int nfs_active_reclaim(void);
260 static int nfs_rnode_reclaim(void);
261 static void nfs_reclaim(void *);
262 static int failover_safe(failinfo_t *);
263 static void failover_newserver(mntinfo_t *mi);
264 static void failover_thread(mntinfo_t *mi);
265 static int failover_wait(mntinfo_t *);
266 static int failover_remap(failinfo_t *);
267 static int failover_lookup(char *, vnode_t *,
268 int (*)(vnode_t *, char *, vnode_t **,
269 struct pathname *, int, vnode_t *, cred_t *, int),
270 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
271 vnode_t **);
272 static void nfs_free_r_path(rnode_t *);
273 static void nfs_set_vroot(vnode_t *);
274 static char *nfs_getsrvnames(mntinfo_t *, size_t *);
275
276 /*
277 * from rpcsec module (common/rpcsec)
278 */
279 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
280 extern void sec_clnt_freeh(AUTH *);
281 extern void sec_clnt_freeinfo(struct sec_data *);
282
283 /*
284 * used in mount policy
285 */
286 extern ts_label_t *getflabel_cipso(vfs_t *);
287
288 /*
289 * EIO or EINTR are not recoverable errors.
290 */
291 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
292
293 #ifdef DEBUG
294 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n"
295 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
296 #else
297 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n"
298 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
299 #endif
300 /*
301 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
302 */
303 static int
304 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
305 struct chtab **chp, struct nfs_clnt *nfscl)
306 {
307 struct chhead *ch, *newch;
308 struct chhead **plistp;
309 struct chtab *cp;
310 int error;
311 k_sigset_t smask;
312
313 if (newcl == NULL || chp == NULL || ci == NULL)
314 return (EINVAL);
315
316 *newcl = NULL;
317 *chp = NULL;
318
319 /*
320 * Find an unused handle or create one
321 */
322 newch = NULL;
323 nfscl->nfscl_stat.clgets.value.ui64++;
324 top:
325 /*
326 * Find the correct entry in the cache to check for free
327 * client handles. The search is based on the RPC program
328 * number, program version number, dev_t for the transport
329 * device, and the protocol family.
330 */
331 mutex_enter(&nfscl->nfscl_chtable_lock);
332 plistp = &nfscl->nfscl_chtable;
333 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
334 if (ch->ch_prog == ci->cl_prog &&
335 ch->ch_vers == ci->cl_vers &&
336 ch->ch_dev == svp->sv_knconf->knc_rdev &&
337 (strcmp(ch->ch_protofmly,
338 svp->sv_knconf->knc_protofmly) == 0))
339 break;
340 plistp = &ch->ch_next;
341 }
342
343 /*
344 * If we didn't find a cache entry for this quadruple, then
345 * create one. If we don't have one already preallocated,
346 * then drop the cache lock, create one, and then start over.
347 * If we did have a preallocated entry, then just add it to
348 * the front of the list.
349 */
350 if (ch == NULL) {
351 if (newch == NULL) {
352 mutex_exit(&nfscl->nfscl_chtable_lock);
353 newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
354 newch->ch_timesused = 0;
355 newch->ch_prog = ci->cl_prog;
356 newch->ch_vers = ci->cl_vers;
357 newch->ch_dev = svp->sv_knconf->knc_rdev;
358 newch->ch_protofmly = kmem_alloc(
359 strlen(svp->sv_knconf->knc_protofmly) + 1,
360 KM_SLEEP);
361 (void) strcpy(newch->ch_protofmly,
362 svp->sv_knconf->knc_protofmly);
363 newch->ch_list = NULL;
364 goto top;
365 }
366 ch = newch;
367 newch = NULL;
368 ch->ch_next = nfscl->nfscl_chtable;
369 nfscl->nfscl_chtable = ch;
370 /*
371 * We found a cache entry, but if it isn't on the front of the
372 * list, then move it to the front of the list to try to take
373 * advantage of locality of operations.
374 */
375 } else if (ch != nfscl->nfscl_chtable) {
376 *plistp = ch->ch_next;
377 ch->ch_next = nfscl->nfscl_chtable;
378 nfscl->nfscl_chtable = ch;
379 }
380
381 /*
382 * If there was a free client handle cached, then remove it
383 * from the list, init it, and use it.
384 */
385 if (ch->ch_list != NULL) {
386 cp = ch->ch_list;
387 ch->ch_list = cp->ch_list;
388 mutex_exit(&nfscl->nfscl_chtable_lock);
389 if (newch != NULL) {
390 kmem_free(newch->ch_protofmly,
391 strlen(newch->ch_protofmly) + 1);
392 kmem_free(newch, sizeof (*newch));
393 }
394 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
395 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
396 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
397 &cp->ch_client->cl_auth);
398 if (error || cp->ch_client->cl_auth == NULL) {
399 CLNT_DESTROY(cp->ch_client);
400 kmem_cache_free(chtab_cache, cp);
401 return ((error != 0) ? error : EINTR);
402 }
403 ch->ch_timesused++;
404 *newcl = cp->ch_client;
405 *chp = cp;
406 return (0);
407 }
408
409 /*
410 * There weren't any free client handles which fit, so allocate
411 * a new one and use that.
412 */
413 #ifdef DEBUG
414 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
415 #endif
416 mutex_exit(&nfscl->nfscl_chtable_lock);
417
418 nfscl->nfscl_stat.cltoomany.value.ui64++;
419 if (newch != NULL) {
420 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
421 kmem_free(newch, sizeof (*newch));
422 }
423
424 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
425 cp->ch_head = ch;
426
427 sigintr(&smask, (int)ci->cl_flags & MI_INT);
428 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
429 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
430 sigunintr(&smask);
431
432 if (error != 0) {
433 kmem_cache_free(chtab_cache, cp);
434 #ifdef DEBUG
435 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
436 #endif
437 /*
438 * Warning is unnecessary if error is EINTR.
439 */
440 if (error != EINTR) {
441 nfs_cmn_err(error, CE_WARN,
442 "clget: couldn't create handle: %m\n");
443 }
444 return (error);
445 }
446 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
447 auth_destroy(cp->ch_client->cl_auth);
448 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
449 &cp->ch_client->cl_auth);
450 if (error || cp->ch_client->cl_auth == NULL) {
451 CLNT_DESTROY(cp->ch_client);
452 kmem_cache_free(chtab_cache, cp);
453 #ifdef DEBUG
454 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
455 #endif
456 return ((error != 0) ? error : EINTR);
457 }
458 ch->ch_timesused++;
459 *newcl = cp->ch_client;
460 ASSERT(cp->ch_client->cl_nosignal == FALSE);
461 *chp = cp;
462 return (0);
463 }
464
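/*
 * clget() is the exported form of clget_impl(); it supplies the calling
 * zone's NFS client data.
 */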
465 int
466 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
467 struct chtab **chp)
468 {
469 struct nfs_clnt *nfscl;
470
471 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
472 ASSERT(nfscl != NULL);
473
474 return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
475 }
476
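/*
 * Get a client handle for the NFS_ACL program.  For hard mounts the
 * clget_impl() call is retried while it fails with ETIMEDOUT or
 * ECONNRESET, unless the filesystem is going away or uses failover.
 */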
477 static int
478 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
479 struct chtab **chp, struct nfs_clnt *nfscl)
480 {
481 clinfo_t ci;
482 int error;
483
484 /*
485 * Set read buffer size to rsize
486 * and add room for RPC headers.
487 */
488 ci.cl_readsize = mi->mi_tsize;
489 if (ci.cl_readsize != 0)
490 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
491
492 /*
493  * If soft mount and server is down, just try once,
494 * meaning: do not retransmit.
495 */
496 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
497 ci.cl_retrans = 0;
498 else
499 ci.cl_retrans = mi->mi_retrans;
500
501 ci.cl_prog = NFS_ACL_PROGRAM;
502 ci.cl_vers = mi->mi_vers;
503 ci.cl_flags = mi->mi_flags;
504
505 /*
506 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
507 * security flavor, the client tries to establish a security context
508 * by contacting the server. If the connection is timed out or reset,
509 * e.g. server reboot, we will try again.
510 */
511 do {
512 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
513
514 if (error == 0)
515 break;
516
517 /*
518 * For forced unmount or zone shutdown, bail out, no retry.
519 */
520 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
521 error = EIO;
522 break;
523 }
524
525 /* do not retry for softmount */
526 if (!(mi->mi_flags & MI_HARD))
527 break;
528
529 /* let the caller deal with the failover case */
530 if (FAILOVER_MOUNT(mi))
531 break;
532
533 } while (error == ETIMEDOUT || error == ECONNRESET);
534
535 return (error);
536 }
537
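/*
 * Get a client handle for the NFS program itself; the retry semantics
 * are the same as for acl_clget() above.
 */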
538 static int
539 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
540 struct chtab **chp, struct nfs_clnt *nfscl)
541 {
542 clinfo_t ci;
543 int error;
544
545 /*
546 * Set read buffer size to rsize
547 * and add room for RPC headers.
548 */
549 ci.cl_readsize = mi->mi_tsize;
550 if (ci.cl_readsize != 0)
551 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
552
553 /*
554  * If soft mount and server is down, just try once,
555 * meaning: do not retransmit.
556 */
557 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
558 ci.cl_retrans = 0;
559 else
560 ci.cl_retrans = mi->mi_retrans;
561
562 ci.cl_prog = mi->mi_prog;
563 ci.cl_vers = mi->mi_vers;
564 ci.cl_flags = mi->mi_flags;
565
566 /*
567 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
568 * security flavor, the client tries to establish a security context
569 * by contacting the server. If the connection is timed out or reset,
570 * e.g. server reboot, we will try again.
571 */
572 do {
573 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
574
575 if (error == 0)
576 break;
577
578 /*
579 * For forced unmount or zone shutdown, bail out, no retry.
580 */
581 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
582 error = EIO;
583 break;
584 }
585
586 /* do not retry for softmount */
587 if (!(mi->mi_flags & MI_HARD))
588 break;
589
590 /* let the caller deal with the failover case */
591 if (FAILOVER_MOUNT(mi))
592 break;
593
594 } while (error == ETIMEDOUT || error == ECONNRESET);
595
596 return (error);
597 }
598
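/*
 * Return a client handle to its cache entry's free list.  The auth handle
 * is released and the entry is timestamped so clreclaim() can tell how
 * long it has been idle.
 */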
599 static void
600 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
601 {
602 if (cl->cl_auth != NULL) {
603 sec_clnt_freeh(cl->cl_auth);
604 cl->cl_auth = NULL;
605 }
606
607 /*
608 * Timestamp this cache entry so that we know when it was last
609 * used.
610 */
611 cp->ch_freed = gethrestime_sec();
612
613 /*
614 * Add the free client handle to the front of the list.
615 * This way, the list will be sorted in youngest to oldest
616 * order.
617 */
618 mutex_enter(&nfscl->nfscl_chtable_lock);
619 cp->ch_list = cp->ch_head->ch_list;
620 cp->ch_head->ch_list = cp;
621 mutex_exit(&nfscl->nfscl_chtable_lock);
622 }
623
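/*
 * clfree() is the exported form of clfree_impl(); it supplies the calling
 * zone's NFS client data.
 */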
624 void
625 clfree(CLIENT *cl, struct chtab *cp)
626 {
627 struct nfs_clnt *nfscl;
628
629 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
630 ASSERT(nfscl != NULL);
631
632 clfree_impl(cl, cp, nfscl);
633 }
634
635 #define CL_HOLDTIME 60 /* time to hold client handles */
636
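/*
 * Free every cached client handle in the given zone that has been idle
 * for at least cl_holdtime seconds.  Each per-quadruple list is kept in
 * youngest-to-oldest order, so it only needs to be walked up to the first
 * entry that is old enough.
 */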
637 static void
638 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
639 {
640 struct chhead *ch;
641 struct chtab *cp; /* list of objects that can be reclaimed */
642 struct chtab *cpe;
643 struct chtab *cpl;
644 struct chtab **cpp;
645 #ifdef DEBUG
646 int n = 0;
647 #endif
648
649 /*
650 * Need to reclaim some memory, so step through the cache
651 * looking through the lists for entries which can be freed.
652 */
653 cp = NULL;
654
655 mutex_enter(&nfscl->nfscl_chtable_lock);
656
657 /*
658 * Here we step through each non-NULL quadruple and start to
659 * construct the reclaim list pointed to by cp. Note that
660 * cp will contain all eligible chtab entries. When this traversal
661 * completes, chtab entries from the last quadruple will be at the
662 * front of cp and entries from previously inspected quadruples have
663 * been appended to the rear of cp.
664 */
665 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
666 if (ch->ch_list == NULL)
667 continue;
668 /*
669  * Search each list for entries older than
670 * cl_holdtime seconds. The lists are maintained
671 * in youngest to oldest order so that when the
672 * first entry is found which is old enough, then
673 * all of the rest of the entries on the list will
674 * be old enough as well.
675 */
676 cpl = ch->ch_list;
677 cpp = &ch->ch_list;
678 while (cpl != NULL &&
679 cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
680 cpp = &cpl->ch_list;
681 cpl = cpl->ch_list;
682 }
683 if (cpl != NULL) {
684 *cpp = NULL;
685 if (cp != NULL) {
686 cpe = cpl;
687 while (cpe->ch_list != NULL)
688 cpe = cpe->ch_list;
689 cpe->ch_list = cp;
690 }
691 cp = cpl;
692 }
693 }
694
695 mutex_exit(&nfscl->nfscl_chtable_lock);
696
697 /*
698 * If cp is empty, then there is nothing to reclaim here.
699 */
700 if (cp == NULL)
701 return;
702
703 /*
704 * Step through the list of entries to free, destroying each client
705 * handle and kmem_free'ing the memory for each entry.
706 */
707 while (cp != NULL) {
708 #ifdef DEBUG
709 n++;
710 #endif
711 CLNT_DESTROY(cp->ch_client);
712 cpl = cp->ch_list;
713 kmem_cache_free(chtab_cache, cp);
714 cp = cpl;
715 }
716
717 #ifdef DEBUG
718 /*
719 * Update clalloc so that nfsstat shows the current number
720 * of allocated client handles.
721 */
722 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
723 #endif
724 }
725
726 /* ARGSUSED */
727 static void
728 clreclaim(void *all)
729 {
730 struct nfs_clnt *nfscl;
731
732 #ifdef DEBUG
733 clstat_debug.clreclaim.value.ui64++;
734 #endif
735 /*
736 * The system is low on memory; go through and try to reclaim some from
737 * every zone on the system.
738 */
739 mutex_enter(&nfs_clnt_list_lock);
740 nfscl = list_head(&nfs_clnt_list);
741 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
742 clreclaim_zone(nfscl, CL_HOLDTIME);
743 mutex_exit(&nfs_clnt_list_lock);
744 }
745
746 /*
747 * Minimum time-out values indexed by call type
748  * These units are in "eighths" of a second to avoid multiplies
749 */
750 static unsigned int minimum_timeo[] = {
751 6, 7, 10
752 };
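/*
 * A table value of 6, for example, means 6/8 of a second; callers convert
 * to clock ticks with (minimum_timeo[type] * hz) >> 3, as rfscall() and
 * aclcall() do below.
 */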
753
754 /*
755 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
756 */
757 #define MAXTIMO (20*hz)
758 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
759 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
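/*
 * For example, on a system where hz is 100, MAXTIMO is 2000 ticks (20
 * seconds) and an initial timeout of 110 ticks backs off as
 * 110 -> 220 -> 440 -> 880 -> 1760 -> 2000, after which it stays clamped
 * at MAXTIMO.
 */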
760
761 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
762 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
763 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
764
765 /*
766 * Function called when rfscall notices that we have been
767 * re-transmitting, or when we get a response without retransmissions.
768 * Return 1 if the transfer size was adjusted down - 0 if no change.
769 */
770 static int
771 nfs_feedback(int flag, int which, mntinfo_t *mi)
772 {
773 int kind;
774 int r = 0;
775
776 mutex_enter(&mi->mi_lock);
777 if (flag == FEEDBACK_REXMIT1) {
778 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
779 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
780 goto done;
781 if (mi->mi_curread > MIN_NFS_TSIZE) {
782 mi->mi_curread /= 2;
783 if (mi->mi_curread < MIN_NFS_TSIZE)
784 mi->mi_curread = MIN_NFS_TSIZE;
785 r = 1;
786 }
787
788 if (mi->mi_curwrite > MIN_NFS_TSIZE) {
789 mi->mi_curwrite /= 2;
790 if (mi->mi_curwrite < MIN_NFS_TSIZE)
791 mi->mi_curwrite = MIN_NFS_TSIZE;
792 r = 1;
793 }
794 } else if (flag == FEEDBACK_OK) {
795 kind = mi->mi_timer_type[which];
796 if (kind == 0 ||
797 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
798 goto done;
799 if (kind == 1) {
800 if (mi->mi_curread >= mi->mi_tsize)
801 goto done;
802 mi->mi_curread += MIN_NFS_TSIZE;
803 if (mi->mi_curread > mi->mi_tsize/2)
804 mi->mi_curread = mi->mi_tsize;
805 } else if (kind == 2) {
806 if (mi->mi_curwrite >= mi->mi_stsize)
807 goto done;
808 mi->mi_curwrite += MIN_NFS_TSIZE;
809 if (mi->mi_curwrite > mi->mi_stsize/2)
810 mi->mi_curwrite = mi->mi_stsize;
811 }
812 }
813 done:
814 mutex_exit(&mi->mi_lock);
815 return (r);
816 }
817
818 #ifdef DEBUG
819 static int rfs2call_hits = 0;
820 static int rfs2call_misses = 0;
821 #endif
822
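/*
 * rfs2call() is the NFS version 2 wrapper around rfscall().  An
 * NFSERR_ACCES reply is retried once with a cred adjusted by
 * crnetadjust(), and RPC_PROCUNAVAIL is mapped to NFSERR_OPNOTSUPP.
 */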
823 int
824 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
825 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
826 enum nfsstat *statusp, int flags, failinfo_t *fi)
827 {
828 int rpcerror;
829 enum clnt_stat rpc_status;
830
831 ASSERT(statusp != NULL);
832
833 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
834 cr, douprintf, &rpc_status, flags, fi);
835 if (!rpcerror) {
836 /*
837 * See crnetadjust() for comments.
838 */
839 if (*statusp == NFSERR_ACCES &&
840 (cr = crnetadjust(cr)) != NULL) {
841 #ifdef DEBUG
842 rfs2call_hits++;
843 #endif
844 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
845 resp, cr, douprintf, NULL, flags, fi);
846 crfree(cr);
847 #ifdef DEBUG
848 if (*statusp == NFSERR_ACCES)
849 rfs2call_misses++;
850 #endif
851 }
852 } else if (rpc_status == RPC_PROCUNAVAIL) {
853 *statusp = NFSERR_OPNOTSUPP;
854 rpcerror = 0;
855 }
856
857 return (rpcerror);
858 }
859
860 #define NFS3_JUKEBOX_DELAY 10 * hz
861
862 static clock_t nfs3_jukebox_delay = 0;
863
864 #ifdef DEBUG
865 static int rfs3call_hits = 0;
866 static int rfs3call_misses = 0;
867 #endif
868
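/*
 * rfs3call() is the NFS version 3 wrapper around rfscall().  It loops,
 * after a delay, while the server returns NFS3ERR_JUKEBOX (unless the
 * caller is a system process, in which case EAGAIN is returned), and
 * retries an NFS3ERR_ACCES reply once with a cred adjusted by
 * crnetadjust().
 */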
869 int
870 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
871 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
872 nfsstat3 *statusp, int flags, failinfo_t *fi)
873 {
874 int rpcerror;
875 int user_informed;
876
877 user_informed = 0;
878 do {
879 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
880 cr, douprintf, NULL, flags, fi);
881 if (!rpcerror) {
882 cred_t *crr;
883 if (*statusp == NFS3ERR_JUKEBOX) {
884 if (ttoproc(curthread) == &p0) {
885 rpcerror = EAGAIN;
886 break;
887 }
888 if (!user_informed) {
889 user_informed = 1;
890 uprintf(
891 "file temporarily unavailable on the server, retrying...\n");
892 }
893 delay(nfs3_jukebox_delay);
894 }
895 /*
896 * See crnetadjust() for comments.
897 */
898 else if (*statusp == NFS3ERR_ACCES &&
899 (crr = crnetadjust(cr)) != NULL) {
900 #ifdef DEBUG
901 rfs3call_hits++;
902 #endif
903 rpcerror = rfscall(mi, which, xdrargs, argsp,
904 xdrres, resp, crr, douprintf,
905 NULL, flags, fi);
906
907 crfree(crr);
908 #ifdef DEBUG
909 if (*statusp == NFS3ERR_ACCES)
910 rfs3call_misses++;
911 #endif
912 }
913 }
914 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
915
916 return (rpcerror);
917 }
918
919 #define VALID_FH(fi) (VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)
920 #define INC_READERS(mi) { \
921 mi->mi_readers++; \
922 }
923 #define DEC_READERS(mi) { \
924 mi->mi_readers--; \
925 if (mi->mi_readers == 0) \
926 cv_broadcast(&mi->mi_failover_cv); \
927 }
928
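/*
 * rfscall() is the common RPC engine for NFS calls.  It handles client
 * handle caching, failover remapping, Trusted Extensions cred cloning,
 * per-call timeouts with exponential back-off, retries for hard mounts,
 * and the "not responding"/"ok" console messages.
 */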
929 static int
930 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
931 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
932 enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
933 {
934 CLIENT *client;
935 struct chtab *ch;
936 cred_t *cr = icr;
937 enum clnt_stat status;
938 struct rpc_err rpcerr, rpcerr_tmp;
939 struct timeval wait;
940 int timeo; /* in units of hz */
941 int my_rsize, my_wsize;
942 bool_t tryagain;
943 bool_t cred_cloned = FALSE;
944 k_sigset_t smask;
945 servinfo_t *svp;
946 struct nfs_clnt *nfscl;
947 zoneid_t zoneid = getzoneid();
948 char *msg;
949 #ifdef DEBUG
950 char *bufp;
951 #endif
952
953
954 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
955 "rfscall_start:which %d mi %p", which, mi);
956
957 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
958 ASSERT(nfscl != NULL);
959
960 nfscl->nfscl_stat.calls.value.ui64++;
961 mi->mi_reqs[which].value.ui64++;
962
963 rpcerr.re_status = RPC_SUCCESS;
964
965 /*
966 * In case of forced unmount or zone shutdown, return EIO.
967 */
968
969 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
970 rpcerr.re_status = RPC_FAILED;
971 rpcerr.re_errno = EIO;
972 return (rpcerr.re_errno);
973 }
974
975 /*
976 * Remember the transfer sizes in case
977 * nfs_feedback changes them underneath us.
978 */
979 my_rsize = mi->mi_curread;
980 my_wsize = mi->mi_curwrite;
981
982 /*
983 * NFS client failover support
984 *
985 * If this rnode is not in sync with the current server (VALID_FH),
986 * we'd like to do a remap to get in sync. We can be interrupted
987 * in failover_remap(), and if so we'll bail. Otherwise, we'll
988 * use the best info we have to try the RPC. Part of that is
989 * unconditionally updating the filehandle copy kept for V3.
990 *
991  * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
992 * rw_enter(); we're trying to keep the current server from being
993 * changed on us until we're done with the remapping and have a
994  * matching client handle. We don't want to send a filehandle
995 * to the wrong host.
996 */
997 failoverretry:
998 if (FAILOVER_MOUNT(mi)) {
999 mutex_enter(&mi->mi_lock);
1000 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1001 if (failover_wait(mi)) {
1002 mutex_exit(&mi->mi_lock);
1003 return (EINTR);
1004 }
1005 }
1006 INC_READERS(mi);
1007 mutex_exit(&mi->mi_lock);
1008 if (fi) {
1009 if (!VALID_FH(fi) &&
1010 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1011 int remaperr;
1012
1013 svp = mi->mi_curr_serv;
1014 remaperr = failover_remap(fi);
1015 if (remaperr != 0) {
1016 #ifdef DEBUG
1017 if (remaperr != EINTR)
1018 nfs_cmn_err(remaperr, CE_WARN,
1019 "rfscall couldn't failover: %m");
1020 #endif
1021 mutex_enter(&mi->mi_lock);
1022 DEC_READERS(mi);
1023 mutex_exit(&mi->mi_lock);
1024 /*
1025 * If failover_remap returns ETIMEDOUT
1026 * and the filesystem is hard mounted
1027 * we have to retry the call with a new
1028 * server.
1029 */
1030 if ((mi->mi_flags & MI_HARD) &&
1031 IS_RECOVERABLE_ERROR(remaperr)) {
1032 if (svp == mi->mi_curr_serv)
1033 failover_newserver(mi);
1034 rpcerr.re_status = RPC_SUCCESS;
1035 goto failoverretry;
1036 }
1037 rpcerr.re_errno = remaperr;
1038 return (remaperr);
1039 }
1040 }
1041 if (fi->fhp && fi->copyproc)
1042 (*fi->copyproc)(fi->fhp, fi->vp);
1043 }
1044 }
1045
1046 /* For TSOL, use a new cred which has net_mac_aware flag */
1047 if (!cred_cloned && is_system_labeled()) {
1048 cred_cloned = TRUE;
1049 cr = crdup(icr);
1050 (void) setpflags(NET_MAC_AWARE, 1, cr);
1051 }
1052
1053 /*
1054 * clget() calls clnt_tli_kinit() which clears the xid, so we
1055 * are guaranteed to reprocess the retry as a new request.
1056 */
1057 svp = mi->mi_curr_serv;
1058 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1059
1060 if (FAILOVER_MOUNT(mi)) {
1061 mutex_enter(&mi->mi_lock);
1062 DEC_READERS(mi);
1063 mutex_exit(&mi->mi_lock);
1064
1065 if ((rpcerr.re_errno == ETIMEDOUT ||
1066 rpcerr.re_errno == ECONNRESET) &&
1067 failover_safe(fi)) {
1068 if (svp == mi->mi_curr_serv)
1069 failover_newserver(mi);
1070 goto failoverretry;
1071 }
1072 }
1073 if (rpcerr.re_errno != 0)
1074 return (rpcerr.re_errno);
1075
1076 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1077 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1078 timeo = (mi->mi_timeo * hz) / 10;
1079 } else {
1080 mutex_enter(&mi->mi_lock);
1081 timeo = CLNT_SETTIMERS(client,
1082 &(mi->mi_timers[mi->mi_timer_type[which]]),
1083 &(mi->mi_timers[NFS_CALLTYPES]),
1084 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1085 (void (*)())NULL, (caddr_t)mi, 0);
1086 mutex_exit(&mi->mi_lock);
1087 }
1088
1089 /*
1090 * If hard mounted fs, retry call forever unless hard error occurs.
1091 */
1092 do {
1093 tryagain = FALSE;
1094
1095 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1096 status = RPC_FAILED;
1097 rpcerr.re_status = RPC_FAILED;
1098 rpcerr.re_errno = EIO;
1099 break;
1100 }
1101
1102 TICK_TO_TIMEVAL(timeo, &wait);
1103
1104 /*
1105 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1106 * and SIGTERM. (Preserving the existing masks).
1107 * Mask out SIGINT if mount option nointr is specified.
1108 */
1109 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1110 if (!(mi->mi_flags & MI_INT))
1111 client->cl_nosignal = TRUE;
1112
1113 /*
1114 * If there is a current signal, then don't bother
1115 * even trying to send out the request because we
1116 * won't be able to block waiting for the response.
1117 * Simply assume RPC_INTR and get on with it.
1118 */
1119 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1120 status = RPC_INTR;
1121 else {
1122 status = CLNT_CALL(client, which, xdrargs, argsp,
1123 xdrres, resp, wait);
1124 }
1125
1126 if (!(mi->mi_flags & MI_INT))
1127 client->cl_nosignal = FALSE;
1128 /*
1129 * restore original signal mask
1130 */
1131 sigunintr(&smask);
1132
1133 switch (status) {
1134 case RPC_SUCCESS:
1135 if ((mi->mi_flags & MI_DYNAMIC) &&
1136 mi->mi_timer_type[which] != 0 &&
1137 (mi->mi_curread != my_rsize ||
1138 mi->mi_curwrite != my_wsize))
1139 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1140 break;
1141
1142 case RPC_INTR:
1143 /*
1144 * There is no way to recover from this error,
1145 * even if mount option nointr is specified.
1146 * SIGKILL, for example, cannot be blocked.
1147 */
1148 rpcerr.re_status = RPC_INTR;
1149 rpcerr.re_errno = EINTR;
1150 break;
1151
1152 case RPC_UDERROR:
1153 /*
1154 * If the NFS server is local (vold) and
1155 * it goes away then we get RPC_UDERROR.
1156 * This is a retryable error, so we would
1157  * loop; check to see if the specific
1158 * error was ECONNRESET, indicating that
1159  * the target did not exist at all. If so,
1160 * return with RPC_PROGUNAVAIL and
1161 * ECONNRESET to indicate why.
1162 */
1163 CLNT_GETERR(client, &rpcerr);
1164 if (rpcerr.re_errno == ECONNRESET) {
1165 rpcerr.re_status = RPC_PROGUNAVAIL;
1166 rpcerr.re_errno = ECONNRESET;
1167 break;
1168 }
1169 /*FALLTHROUGH*/
1170
1171 default: /* probably RPC_TIMEDOUT */
1172 if (IS_UNRECOVERABLE_RPC(status))
1173 break;
1174
1175 /*
1176 * increment server not responding count
1177 */
1178 mutex_enter(&mi->mi_lock);
1179 mi->mi_noresponse++;
1180 mutex_exit(&mi->mi_lock);
1181 #ifdef DEBUG
1182 nfscl->nfscl_stat.noresponse.value.ui64++;
1183 #endif
1184
1185 if (!(mi->mi_flags & MI_HARD)) {
1186 if (!(mi->mi_flags & MI_SEMISOFT) ||
1187 (mi->mi_ss_call_type[which] == 0))
1188 break;
1189 }
1190
1191 /*
1192 * The call is in progress (over COTS).
1193 * Try the CLNT_CALL again, but don't
1194 * print a noisy error message.
1195 */
1196 if (status == RPC_INPROGRESS) {
1197 tryagain = TRUE;
1198 break;
1199 }
1200
1201 if (flags & RFSCALL_SOFT)
1202 break;
1203
1204 /*
1205 * On zone shutdown, just move on.
1206 */
1207 if (zone_status_get(curproc->p_zone) >=
1208 ZONE_IS_SHUTTING_DOWN) {
1209 rpcerr.re_status = RPC_FAILED;
1210 rpcerr.re_errno = EIO;
1211 break;
1212 }
1213
1214 /*
1215 * NFS client failover support
1216 *
1217 * If the current server just failed us, we'll
1218 * start the process of finding a new server.
1219 * After that, we can just retry.
1220 */
1221 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1222 if (svp == mi->mi_curr_serv)
1223 failover_newserver(mi);
1224 clfree_impl(client, ch, nfscl);
1225 goto failoverretry;
1226 }
1227
1228 tryagain = TRUE;
1229 timeo = backoff(timeo);
1230
1231 CLNT_GETERR(client, &rpcerr_tmp);
1232 if ((status == RPC_CANTSEND) &&
1233 (rpcerr_tmp.re_errno == ENOBUFS))
1234 msg = SRV_QFULL_MSG;
1235 else
1236 msg = SRV_NOTRESP_MSG;
1237
1238 mutex_enter(&mi->mi_lock);
1239 if (!(mi->mi_flags & MI_PRINTED)) {
1240 mi->mi_flags |= MI_PRINTED;
1241 mutex_exit(&mi->mi_lock);
1242 #ifdef DEBUG
1243 zprintf(zoneid, msg, mi->mi_vers,
1244 svp->sv_hostname);
1245 #else
1246 zprintf(zoneid, msg, svp->sv_hostname);
1247 #endif
1248 } else
1249 mutex_exit(&mi->mi_lock);
1250 if (*douprintf && nfs_has_ctty()) {
1251 *douprintf = 0;
1252 if (!(mi->mi_flags & MI_NOPRINT))
1253 #ifdef DEBUG
1254 uprintf(msg, mi->mi_vers,
1255 svp->sv_hostname);
1256 #else
1257 uprintf(msg, svp->sv_hostname);
1258 #endif
1259 }
1260
1261 /*
1262 * If doing dynamic adjustment of transfer
1263 * size and if it's a read or write call
1264 * and if the transfer size changed while
1265 * retransmitting or if the feedback routine
1266 * changed the transfer size,
1267 * then exit rfscall so that the transfer
1268 * size can be adjusted at the vnops level.
1269 */
1270 if ((mi->mi_flags & MI_DYNAMIC) &&
1271 mi->mi_timer_type[which] != 0 &&
1272 (mi->mi_curread != my_rsize ||
1273 mi->mi_curwrite != my_wsize ||
1274 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1275 /*
1276 * On read or write calls, return
1277 * back to the vnode ops level if
1278 * the transfer size changed.
1279 */
1280 clfree_impl(client, ch, nfscl);
1281 if (cred_cloned)
1282 crfree(cr);
1283 return (ENFS_TRYAGAIN);
1284 }
1285 }
1286 } while (tryagain);
1287
1288 if (status != RPC_SUCCESS) {
1289 /*
1290 * Let soft mounts use the timed out message.
1291 */
1292 if (status == RPC_INPROGRESS)
1293 status = RPC_TIMEDOUT;
1294 nfscl->nfscl_stat.badcalls.value.ui64++;
1295 if (status != RPC_INTR) {
1296 mutex_enter(&mi->mi_lock);
1297 mi->mi_flags |= MI_DOWN;
1298 mutex_exit(&mi->mi_lock);
1299 CLNT_GETERR(client, &rpcerr);
1300 #ifdef DEBUG
1301 bufp = clnt_sperror(client, svp->sv_hostname);
1302 zprintf(zoneid, "NFS%d %s failed for %s\n",
1303 mi->mi_vers, mi->mi_rfsnames[which], bufp);
1304 if (nfs_has_ctty()) {
1305 if (!(mi->mi_flags & MI_NOPRINT)) {
1306 uprintf("NFS%d %s failed for %s\n",
1307 mi->mi_vers, mi->mi_rfsnames[which],
1308 bufp);
1309 }
1310 }
1311 kmem_free(bufp, MAXPATHLEN);
1312 #else
1313 zprintf(zoneid,
1314 "NFS %s failed for server %s: error %d (%s)\n",
1315 mi->mi_rfsnames[which], svp->sv_hostname,
1316 status, clnt_sperrno(status));
1317 if (nfs_has_ctty()) {
1318 if (!(mi->mi_flags & MI_NOPRINT)) {
1319 uprintf(
1320 "NFS %s failed for server %s: error %d (%s)\n",
1321 mi->mi_rfsnames[which],
1322 svp->sv_hostname, status,
1323 clnt_sperrno(status));
1324 }
1325 }
1326 #endif
1327 /*
1328 * when CLNT_CALL() fails with RPC_AUTHERROR,
1329 * re_errno is set appropriately depending on
1330 * the authentication error
1331 */
1332 if (status == RPC_VERSMISMATCH ||
1333 status == RPC_PROGVERSMISMATCH)
1334 rpcerr.re_errno = EIO;
1335 }
1336 } else {
1337 /*
1338 * Test the value of mi_down and mi_printed without
1339 * holding the mi_lock mutex. If they are both zero,
1340 * then it is okay to skip the down and printed
1341 * processing. This saves on a mutex_enter and
1342 * mutex_exit pair for a normal, successful RPC.
1343 * This was just complete overhead.
1344 */
1345 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1346 mutex_enter(&mi->mi_lock);
1347 mi->mi_flags &= ~MI_DOWN;
1348 if (mi->mi_flags & MI_PRINTED) {
1349 mi->mi_flags &= ~MI_PRINTED;
1350 mutex_exit(&mi->mi_lock);
1351 #ifdef DEBUG
1352 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1353 zprintf(zoneid, "NFS%d server %s ok\n",
1354 mi->mi_vers, svp->sv_hostname);
1355 #else
1356 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1357 zprintf(zoneid, "NFS server %s ok\n",
1358 svp->sv_hostname);
1359 #endif
1360 } else
1361 mutex_exit(&mi->mi_lock);
1362 }
1363
1364 if (*douprintf == 0) {
1365 if (!(mi->mi_flags & MI_NOPRINT))
1366 #ifdef DEBUG
1367 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1368 uprintf("NFS%d server %s ok\n",
1369 mi->mi_vers, svp->sv_hostname);
1370 #else
1371 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1372 uprintf("NFS server %s ok\n", svp->sv_hostname);
1373 #endif
1374 *douprintf = 1;
1375 }
1376 }
1377
1378 clfree_impl(client, ch, nfscl);
1379 if (cred_cloned)
1380 crfree(cr);
1381
1382 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1383
1384 if (rpc_status != NULL)
1385 *rpc_status = rpcerr.re_status;
1386
1387 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1388 rpcerr.re_errno);
1389
1390 return (rpcerr.re_errno);
1391 }
1392
1393 #ifdef DEBUG
1394 static int acl2call_hits = 0;
1395 static int acl2call_misses = 0;
1396 #endif
1397
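/*
 * acl2call() is the NFS_ACL version 2 wrapper around aclcall(); an
 * NFSERR_ACCES reply is retried once with a cred adjusted by
 * crnetadjust().
 */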
1398 int
1399 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1400 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1401 enum nfsstat *statusp, int flags, failinfo_t *fi)
1402 {
1403 int rpcerror;
1404
1405 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1406 cr, douprintf, flags, fi);
1407 if (!rpcerror) {
1408 /*
1409 * See comments with crnetadjust().
1410 */
1411 if (*statusp == NFSERR_ACCES &&
1412 (cr = crnetadjust(cr)) != NULL) {
1413 #ifdef DEBUG
1414 acl2call_hits++;
1415 #endif
1416 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1417 resp, cr, douprintf, flags, fi);
1418 crfree(cr);
1419 #ifdef DEBUG
1420 if (*statusp == NFSERR_ACCES)
1421 acl2call_misses++;
1422 #endif
1423 }
1424 }
1425
1426 return (rpcerror);
1427 }
1428
1429 #ifdef DEBUG
1430 static int acl3call_hits = 0;
1431 static int acl3call_misses = 0;
1432 #endif
1433
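/*
 * acl3call() is the NFS_ACL version 3 wrapper around aclcall().  It loops
 * while the server returns NFS3ERR_JUKEBOX and retries an NFS3ERR_ACCES
 * reply once with a cred adjusted by crnetadjust().
 */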
1434 int
1435 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1436 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1437 nfsstat3 *statusp, int flags, failinfo_t *fi)
1438 {
1439 int rpcerror;
1440 int user_informed;
1441
1442 user_informed = 0;
1443
1444 do {
1445 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1446 cr, douprintf, flags, fi);
1447 if (!rpcerror) {
1448 cred_t *crr;
1449 if (*statusp == NFS3ERR_JUKEBOX) {
1450 if (!user_informed) {
1451 user_informed = 1;
1452 uprintf(
1453 "file temporarily unavailable on the server, retrying...\n");
1454 }
1455 delay(nfs3_jukebox_delay);
1456 }
1457 /*
1458 * See crnetadjust() for comments.
1459 */
1460 else if (*statusp == NFS3ERR_ACCES &&
1461 (crr = crnetadjust(cr)) != NULL) {
1462 #ifdef DEBUG
1463 acl3call_hits++;
1464 #endif
1465 rpcerror = aclcall(mi, which, xdrargs, argsp,
1466 xdrres, resp, crr, douprintf, flags, fi);
1467
1468 crfree(crr);
1469 #ifdef DEBUG
1470 if (*statusp == NFS3ERR_ACCES)
1471 acl3call_misses++;
1472 #endif
1473 }
1474 }
1475 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1476
1477 return (rpcerror);
1478 }
1479
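/*
 * aclcall() is the NFS_ACL analogue of rfscall().  Dynamic transfer size
 * feedback is not enabled here yet, and certain RPC errors are taken to
 * mean that the server does not support the NFS_ACL program (MI_ACL) or
 * its extended attribute procedures (MI_EXTATTR).
 */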
1480 static int
1481 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1482 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1483 int flags, failinfo_t *fi)
1484 {
1485 CLIENT *client;
1486 struct chtab *ch;
1487 cred_t *cr = icr;
1488 bool_t cred_cloned = FALSE;
1489 enum clnt_stat status;
1490 struct rpc_err rpcerr;
1491 struct timeval wait;
1492 int timeo; /* in units of hz */
1493 #if 0 /* notyet */
1494 int my_rsize, my_wsize;
1495 #endif
1496 bool_t tryagain;
1497 k_sigset_t smask;
1498 servinfo_t *svp;
1499 struct nfs_clnt *nfscl;
1500 zoneid_t zoneid = getzoneid();
1501 #ifdef DEBUG
1502 char *bufp;
1503 #endif
1504
1505 #if 0 /* notyet */
1506 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1507 "rfscall_start:which %d mi %p", which, mi);
1508 #endif
1509
1510 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1511 ASSERT(nfscl != NULL);
1512
1513 nfscl->nfscl_stat.calls.value.ui64++;
1514 mi->mi_aclreqs[which].value.ui64++;
1515
1516 rpcerr.re_status = RPC_SUCCESS;
1517
1518 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1519 rpcerr.re_status = RPC_FAILED;
1520 rpcerr.re_errno = EIO;
1521 return (rpcerr.re_errno);
1522 }
1523
1524 #if 0 /* notyet */
1525 /*
1526 * Remember the transfer sizes in case
1527 * nfs_feedback changes them underneath us.
1528 */
1529 my_rsize = mi->mi_curread;
1530 my_wsize = mi->mi_curwrite;
1531 #endif
1532
1533 /*
1534 * NFS client failover support
1535 *
1536 * If this rnode is not in sync with the current server (VALID_FH),
1537 * we'd like to do a remap to get in sync. We can be interrupted
1538 * in failover_remap(), and if so we'll bail. Otherwise, we'll
1539 * use the best info we have to try the RPC. Part of that is
1540 * unconditionally updating the filehandle copy kept for V3.
1541 *
1542  * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
1543 * rw_enter(); we're trying to keep the current server from being
1544 * changed on us until we're done with the remapping and have a
1545  * matching client handle. We don't want to send a filehandle
1546 * to the wrong host.
1547 */
1548 failoverretry:
1549 if (FAILOVER_MOUNT(mi)) {
1550 mutex_enter(&mi->mi_lock);
1551 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1552 if (failover_wait(mi)) {
1553 mutex_exit(&mi->mi_lock);
1554 return (EINTR);
1555 }
1556 }
1557 INC_READERS(mi);
1558 mutex_exit(&mi->mi_lock);
1559 if (fi) {
1560 if (!VALID_FH(fi) &&
1561 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1562 int remaperr;
1563
1564 svp = mi->mi_curr_serv;
1565 remaperr = failover_remap(fi);
1566 if (remaperr != 0) {
1567 #ifdef DEBUG
1568 if (remaperr != EINTR)
1569 nfs_cmn_err(remaperr, CE_WARN,
1570 "aclcall couldn't failover: %m");
1571 #endif
1572 mutex_enter(&mi->mi_lock);
1573 DEC_READERS(mi);
1574 mutex_exit(&mi->mi_lock);
1575
1576 /*
1577 * If failover_remap returns ETIMEDOUT
1578 * and the filesystem is hard mounted
1579 * we have to retry the call with a new
1580 * server.
1581 */
1582 if ((mi->mi_flags & MI_HARD) &&
1583 IS_RECOVERABLE_ERROR(remaperr)) {
1584 if (svp == mi->mi_curr_serv)
1585 failover_newserver(mi);
1586 rpcerr.re_status = RPC_SUCCESS;
1587 goto failoverretry;
1588 }
1589 return (remaperr);
1590 }
1591 }
1592 if (fi->fhp && fi->copyproc)
1593 (*fi->copyproc)(fi->fhp, fi->vp);
1594 }
1595 }
1596
1597 /* For TSOL, use a new cred which has net_mac_aware flag */
1598 if (!cred_cloned && is_system_labeled()) {
1599 cred_cloned = TRUE;
1600 cr = crdup(icr);
1601 (void) setpflags(NET_MAC_AWARE, 1, cr);
1602 }
1603
1604 /*
1605 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1606 * are guaranteed to reprocess the retry as a new request.
1607 */
1608 svp = mi->mi_curr_serv;
1609 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1610 if (FAILOVER_MOUNT(mi)) {
1611 mutex_enter(&mi->mi_lock);
1612 DEC_READERS(mi);
1613 mutex_exit(&mi->mi_lock);
1614
1615 if ((rpcerr.re_errno == ETIMEDOUT ||
1616 rpcerr.re_errno == ECONNRESET) &&
1617 failover_safe(fi)) {
1618 if (svp == mi->mi_curr_serv)
1619 failover_newserver(mi);
1620 goto failoverretry;
1621 }
1622 }
1623 if (rpcerr.re_errno != 0) {
1624 if (cred_cloned)
1625 crfree(cr);
1626 return (rpcerr.re_errno);
1627 }
1628
1629 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1630 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1631 timeo = (mi->mi_timeo * hz) / 10;
1632 } else {
1633 mutex_enter(&mi->mi_lock);
1634 timeo = CLNT_SETTIMERS(client,
1635 &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1636 &(mi->mi_timers[NFS_CALLTYPES]),
1637 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1638 (void (*)()) 0, (caddr_t)mi, 0);
1639 mutex_exit(&mi->mi_lock);
1640 }
1641
1642 /*
1643 * If hard mounted fs, retry call forever unless hard error occurs.
1644 */
1645 do {
1646 tryagain = FALSE;
1647
1648 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1649 status = RPC_FAILED;
1650 rpcerr.re_status = RPC_FAILED;
1651 rpcerr.re_errno = EIO;
1652 break;
1653 }
1654
1655 TICK_TO_TIMEVAL(timeo, &wait);
1656
1657 /*
1658 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1659 * and SIGTERM. (Preserving the existing masks).
1660 * Mask out SIGINT if mount option nointr is specified.
1661 */
1662 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1663 if (!(mi->mi_flags & MI_INT))
1664 client->cl_nosignal = TRUE;
1665
1666 /*
1667 * If there is a current signal, then don't bother
1668 * even trying to send out the request because we
1669 * won't be able to block waiting for the response.
1670 * Simply assume RPC_INTR and get on with it.
1671 */
1672 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1673 status = RPC_INTR;
1674 else {
1675 status = CLNT_CALL(client, which, xdrargs, argsp,
1676 xdrres, resp, wait);
1677 }
1678
1679 if (!(mi->mi_flags & MI_INT))
1680 client->cl_nosignal = FALSE;
1681 /*
1682 * restore original signal mask
1683 */
1684 sigunintr(&smask);
1685
1686 switch (status) {
1687 case RPC_SUCCESS:
1688 #if 0 /* notyet */
1689 if ((mi->mi_flags & MI_DYNAMIC) &&
1690 mi->mi_timer_type[which] != 0 &&
1691 (mi->mi_curread != my_rsize ||
1692 mi->mi_curwrite != my_wsize))
1693 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1694 #endif
1695 break;
1696
1697 /*
1698 * Unfortunately, there are servers in the world which
1699 * are not coded correctly. They are not prepared to
1700 * handle RPC requests to the NFS port which are not
1701 * NFS requests. Thus, they may try to process the
1702 * NFS_ACL request as if it were an NFS request. This
1703 * does not work. Generally, an error will be generated
1704 * on the client because it will not be able to decode
1705 * the response from the server. However, it seems
1706 * possible that the server may not be able to decode
1707 * the arguments. Thus, the criteria for deciding
1708 * whether the server supports NFS_ACL or not is whether
1709 * the following RPC errors are returned from CLNT_CALL.
1710 */
1711 case RPC_CANTDECODERES:
1712 case RPC_PROGUNAVAIL:
1713 case RPC_CANTDECODEARGS:
1714 case RPC_PROGVERSMISMATCH:
1715 mutex_enter(&mi->mi_lock);
1716 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1717 mutex_exit(&mi->mi_lock);
1718 break;
1719
1720 /*
1721 * If the server supports NFS_ACL but not the new ops
1722 * for extended attributes, make sure we don't retry.
1723 */
1724 case RPC_PROCUNAVAIL:
1725 mutex_enter(&mi->mi_lock);
1726 mi->mi_flags &= ~MI_EXTATTR;
1727 mutex_exit(&mi->mi_lock);
1728 break;
1729
1730 case RPC_INTR:
1731 /*
1732 * There is no way to recover from this error,
1733 * even if mount option nointr is specified.
1734 * SIGKILL, for example, cannot be blocked.
1735 */
1736 rpcerr.re_status = RPC_INTR;
1737 rpcerr.re_errno = EINTR;
1738 break;
1739
1740 case RPC_UDERROR:
1741 /*
1742 * If the NFS server is local (vold) and
1743 * it goes away then we get RPC_UDERROR.
1744 * This is a retryable error, so we would
1745  * loop; check to see if the specific
1746 * error was ECONNRESET, indicating that
1747  * the target did not exist at all. If so,
1748 * return with RPC_PROGUNAVAIL and
1749 * ECONNRESET to indicate why.
1750 */
1751 CLNT_GETERR(client, &rpcerr);
1752 if (rpcerr.re_errno == ECONNRESET) {
1753 rpcerr.re_status = RPC_PROGUNAVAIL;
1754 rpcerr.re_errno = ECONNRESET;
1755 break;
1756 }
1757 /*FALLTHROUGH*/
1758
1759 default: /* probably RPC_TIMEDOUT */
1760 if (IS_UNRECOVERABLE_RPC(status))
1761 break;
1762
1763 /*
1764 * increment server not responding count
1765 */
1766 mutex_enter(&mi->mi_lock);
1767 mi->mi_noresponse++;
1768 mutex_exit(&mi->mi_lock);
1769 #ifdef DEBUG
1770 nfscl->nfscl_stat.noresponse.value.ui64++;
1771 #endif
1772
1773 if (!(mi->mi_flags & MI_HARD)) {
1774 if (!(mi->mi_flags & MI_SEMISOFT) ||
1775 (mi->mi_acl_ss_call_type[which] == 0))
1776 break;
1777 }
1778
1779 /*
1780 * The call is in progress (over COTS).
1781 * Try the CLNT_CALL again, but don't
1782 * print a noisy error message.
1783 */
1784 if (status == RPC_INPROGRESS) {
1785 tryagain = TRUE;
1786 break;
1787 }
1788
1789 if (flags & RFSCALL_SOFT)
1790 break;
1791
1792 /*
1793 * On zone shutdown, just move on.
1794 */
1795 if (zone_status_get(curproc->p_zone) >=
1796 ZONE_IS_SHUTTING_DOWN) {
1797 rpcerr.re_status = RPC_FAILED;
1798 rpcerr.re_errno = EIO;
1799 break;
1800 }
1801
1802 /*
1803 * NFS client failover support
1804 *
1805 * If the current server just failed us, we'll
1806 * start the process of finding a new server.
1807 * After that, we can just retry.
1808 */
1809 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1810 if (svp == mi->mi_curr_serv)
1811 failover_newserver(mi);
1812 clfree_impl(client, ch, nfscl);
1813 goto failoverretry;
1814 }
1815
1816 tryagain = TRUE;
1817 timeo = backoff(timeo);
1818 mutex_enter(&mi->mi_lock);
1819 if (!(mi->mi_flags & MI_PRINTED)) {
1820 mi->mi_flags |= MI_PRINTED;
1821 mutex_exit(&mi->mi_lock);
1822 #ifdef DEBUG
1823 zprintf(zoneid,
1824 "NFS_ACL%d server %s not responding still trying\n",
1825 mi->mi_vers, svp->sv_hostname);
1826 #else
1827 zprintf(zoneid,
1828 "NFS server %s not responding still trying\n",
1829 svp->sv_hostname);
1830 #endif
1831 } else
1832 mutex_exit(&mi->mi_lock);
1833 if (*douprintf && nfs_has_ctty()) {
1834 *douprintf = 0;
1835 if (!(mi->mi_flags & MI_NOPRINT))
1836 #ifdef DEBUG
1837 uprintf(
1838 "NFS_ACL%d server %s not responding still trying\n",
1839 mi->mi_vers, svp->sv_hostname);
1840 #else
1841 uprintf(
1842 "NFS server %s not responding still trying\n",
1843 svp->sv_hostname);
1844 #endif
1845 }
1846
1847 #if 0 /* notyet */
1848 /*
1849 * If doing dynamic adjustment of transfer
1850 * size and if it's a read or write call
1851 * and if the transfer size changed while
1852 * retransmitting or if the feedback routine
1853 * changed the transfer size,
1854 * then exit rfscall so that the transfer
1855 * size can be adjusted at the vnops level.
1856 */
1857 if ((mi->mi_flags & MI_DYNAMIC) &&
1858 mi->mi_acl_timer_type[which] != 0 &&
1859 (mi->mi_curread != my_rsize ||
1860 mi->mi_curwrite != my_wsize ||
1861 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1862 /*
1863 * On read or write calls, return
1864 * back to the vnode ops level if
1865 * the transfer size changed.
1866 */
1867 clfree_impl(client, ch, nfscl);
1868 if (cred_cloned)
1869 crfree(cr);
1870 return (ENFS_TRYAGAIN);
1871 }
1872 #endif
1873 }
1874 } while (tryagain);
1875
1876 if (status != RPC_SUCCESS) {
1877 /*
1878 * Let soft mounts use the timed out message.
1879 */
1880 if (status == RPC_INPROGRESS)
1881 status = RPC_TIMEDOUT;
1882 nfscl->nfscl_stat.badcalls.value.ui64++;
1883 if (status == RPC_CANTDECODERES ||
1884 status == RPC_PROGUNAVAIL ||
1885 status == RPC_PROCUNAVAIL ||
1886 status == RPC_CANTDECODEARGS ||
1887 status == RPC_PROGVERSMISMATCH)
1888 CLNT_GETERR(client, &rpcerr);
1889 else if (status != RPC_INTR) {
1890 mutex_enter(&mi->mi_lock);
1891 mi->mi_flags |= MI_DOWN;
1892 mutex_exit(&mi->mi_lock);
1893 CLNT_GETERR(client, &rpcerr);
1894 #ifdef DEBUG
1895 bufp = clnt_sperror(client, svp->sv_hostname);
1896 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1897 mi->mi_vers, mi->mi_aclnames[which], bufp);
1898 if (nfs_has_ctty()) {
1899 if (!(mi->mi_flags & MI_NOPRINT)) {
1900 uprintf("NFS_ACL%d %s failed for %s\n",
1901 mi->mi_vers, mi->mi_aclnames[which],
1902 bufp);
1903 }
1904 }
1905 kmem_free(bufp, MAXPATHLEN);
1906 #else
1907 zprintf(zoneid,
1908 "NFS %s failed for server %s: error %d (%s)\n",
1909 mi->mi_aclnames[which], svp->sv_hostname,
1910 status, clnt_sperrno(status));
1911 if (nfs_has_ctty()) {
1912 if (!(mi->mi_flags & MI_NOPRINT))
1913 uprintf(
1914 "NFS %s failed for server %s: error %d (%s)\n",
1915 mi->mi_aclnames[which],
1916 svp->sv_hostname, status,
1917 clnt_sperrno(status));
1918 }
1919 #endif
1920 /*
1921 * when CLNT_CALL() fails with RPC_AUTHERROR,
1922 * re_errno is set appropriately depending on
1923 * the authentication error
1924 */
1925 if (status == RPC_VERSMISMATCH ||
1926 status == RPC_PROGVERSMISMATCH)
1927 rpcerr.re_errno = EIO;
1928 }
1929 } else {
1930 /*
1931 * Test the value of mi_down and mi_printed without
1932 * holding the mi_lock mutex. If they are both zero,
1933 * then it is okay to skip the down and printed
1934 * processing. This saves on a mutex_enter and
1935 * mutex_exit pair for a normal, successful RPC.
1936 * This was just complete overhead.
1937 */
1938 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1939 mutex_enter(&mi->mi_lock);
1940 mi->mi_flags &= ~MI_DOWN;
1941 if (mi->mi_flags & MI_PRINTED) {
1942 mi->mi_flags &= ~MI_PRINTED;
1943 mutex_exit(&mi->mi_lock);
1944 #ifdef DEBUG
1945 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1946 mi->mi_vers, svp->sv_hostname);
1947 #else
1948 zprintf(zoneid, "NFS server %s ok\n",
1949 svp->sv_hostname);
1950 #endif
1951 } else
1952 mutex_exit(&mi->mi_lock);
1953 }
1954
1955 if (*douprintf == 0) {
1956 if (!(mi->mi_flags & MI_NOPRINT))
1957 #ifdef DEBUG
1958 uprintf("NFS_ACL%d server %s ok\n",
1959 mi->mi_vers, svp->sv_hostname);
1960 #else
1961 uprintf("NFS server %s ok\n", svp->sv_hostname);
1962 #endif
1963 *douprintf = 1;
1964 }
1965 }
1966
1967 clfree_impl(client, ch, nfscl);
1968 if (cred_cloned)
1969 crfree(cr);
1970
1971 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1972
1973 #if 0 /* notyet */
1974 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1975 rpcerr.re_errno);
1976 #endif
1977
1978 return (rpcerr.re_errno);
1979 }
1980
1981 int
1982 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1983 {
1984 uint_t mask = vap->va_mask;
1985
1986 if (!(mask & AT_MODE))
1987 sa->sa_mode = (uint32_t)-1;
1988 else
1989 sa->sa_mode = vap->va_mode;
1990 if (!(mask & AT_UID))
1991 sa->sa_uid = (uint32_t)-1;
1992 else
1993 sa->sa_uid = (uint32_t)vap->va_uid;
1994 if (!(mask & AT_GID))
1995 sa->sa_gid = (uint32_t)-1;
1996 else
1997 sa->sa_gid = (uint32_t)vap->va_gid;
1998 if (!(mask & AT_SIZE))
1999 sa->sa_size = (uint32_t)-1;
2000 else
2001 sa->sa_size = (uint32_t)vap->va_size;
2002 if (!(mask & AT_ATIME))
2003 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
2004 else {
2005 /* check time validity */
2006 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2007 return (EOVERFLOW);
2008 }
2009 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
2010 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2011 }
2012 if (!(mask & AT_MTIME))
2013 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
2014 else {
2015 /* check time validity */
2016 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2017 return (EOVERFLOW);
2018 }
2019 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2020 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2021 }
2022 return (0);
2023 }
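
/*
 * Illustrative sketch (not from the original source and not part of the
 * build): how a hypothetical caller might use vattr_to_sattr() for a
 * chmod-style setattr, relying on the (uint32_t)-1 "don't change"
 * convention implemented above.
 */
#if 0 /* example only */
static int
example_mode_only_sattr(mode_t mode, struct nfssattr *sa)
{
	struct vattr va;

	va.va_mask = AT_MODE;		/* only the mode is being changed */
	va.va_mode = mode;
	/* every field not named in va_mask is encoded as (uint32_t)-1 */
	return (vattr_to_sattr(&va, sa));
}
#endif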
2024
2025 int
2026 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2027 {
2028 uint_t mask = vap->va_mask;
2029
2030 if (!(mask & AT_MODE))
2031 sa->mode.set_it = FALSE;
2032 else {
2033 sa->mode.set_it = TRUE;
2034 sa->mode.mode = (mode3)vap->va_mode;
2035 }
2036 if (!(mask & AT_UID))
2037 sa->uid.set_it = FALSE;
2038 else {
2039 sa->uid.set_it = TRUE;
2040 sa->uid.uid = (uid3)vap->va_uid;
2041 }
2042 if (!(mask & AT_GID))
2043 sa->gid.set_it = FALSE;
2044 else {
2045 sa->gid.set_it = TRUE;
2046 sa->gid.gid = (gid3)vap->va_gid;
2047 }
2048 if (!(mask & AT_SIZE))
2049 sa->size.set_it = FALSE;
2050 else {
2051 sa->size.set_it = TRUE;
2052 sa->size.size = (size3)vap->va_size;
2053 }
2054 if (!(mask & AT_ATIME))
2055 sa->atime.set_it = DONT_CHANGE;
2056 else {
2057 /* check time validity */
2058 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2059 return (EOVERFLOW);
2060 }
2061 sa->atime.set_it = SET_TO_CLIENT_TIME;
2062 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2063 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2064 }
2065 if (!(mask & AT_MTIME))
2066 sa->mtime.set_it = DONT_CHANGE;
2067 else {
2068 /* check time validity */
2069 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2070 return (EOVERFLOW);
2071 }
2072 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2073 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2074 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2075 }
2076 return (0);
2077 }
2078
2079 void
2080 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2081 {
2082
2083 da->da_fhandle = VTOFH(dvp);
2084 da->da_name = nm;
2085 da->da_flags = 0;
2086 }
2087
2088 void
2089 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2090 {
2091
2092 da->dirp = VTOFH3(dvp);
2093 da->name = nm;
2094 }
2095
2096 int
2097 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2098 {
2099 int error;
2100 rnode_t *rp;
2101 struct vattr va;
2102
2103 va.va_mask = AT_MODE | AT_GID;
2104 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2105 if (error)
2106 return (error);
2107
2108 /*
2109 * To determine the expected group-id of the created file:
2110 * 1) If the filesystem was not mounted with the Old-BSD-compatible
2111 * GRPID option, and the directory's set-gid bit is clear,
2112 * then use the process's gid.
2113 * 2) Otherwise, set the group-id to the gid of the parent directory.
2114 */
2115 rp = VTOR(dvp);
2116 mutex_enter(&rp->r_statelock);
2117 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2118 *gidp = crgetgid(cr);
2119 else
2120 *gidp = va.va_gid;
2121 mutex_exit(&rp->r_statelock);
2122 return (0);
2123 }
2124
2125 int
2126 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2127 {
2128 int error;
2129 struct vattr va;
2130
2131 va.va_mask = AT_MODE;
2132 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2133 if (error)
2134 return (error);
2135
2136 /*
2137 * Modify the expected mode (om) so that the set-gid bit matches
2138 * that of the parent directory (dvp).
2139 */
2140 if (va.va_mode & VSGID)
2141 *omp |= VSGID;
2142 else
2143 *omp &= ~VSGID;
2144 return (0);
2145 }
2146
2147 void
2148 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2149 {
2150
2151 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2152 if (!(vp->v_flag & VSWAPLIKE)) {
2153 mutex_enter(&vp->v_lock);
2154 vp->v_flag |= VSWAPLIKE;
2155 mutex_exit(&vp->v_lock);
2156 }
2157 } else {
2158 if (vp->v_flag & VSWAPLIKE) {
2159 mutex_enter(&vp->v_lock);
2160 vp->v_flag &= ~VSWAPLIKE;
2161 mutex_exit(&vp->v_lock);
2162 }
2163 }
2164 }
2165
2166 /*
2167 * Free the resources associated with an rnode.
2168 */
2169 static void
2170 rinactive(rnode_t *rp, cred_t *cr)
2171 {
2172 vnode_t *vp;
2173 cred_t *cred;
2174 char *contents;
2175 int size;
2176 vsecattr_t *vsp;
2177 int error;
2178 nfs3_pathconf_info *info;
2179
2180 /*
2181 * Before freeing anything, wait until all asynchronous
2182 * activity is done on this rnode. This will allow all
2183 * asynchronous read ahead and write behind i/o's to
2184 * finish.
2185 */
2186 mutex_enter(&rp->r_statelock);
2187 while (rp->r_count > 0)
2188 cv_wait(&rp->r_cv, &rp->r_statelock);
2189 mutex_exit(&rp->r_statelock);
2190
2191 /*
2192 * Flush and invalidate all pages associated with the vnode.
2193 */
2194 vp = RTOV(rp);
2195 if (vn_has_cached_data(vp)) {
2196 ASSERT(vp->v_type != VCHR);
2197 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2198 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, 0, cr, NULL);
2199 if (error && (error == ENOSPC || error == EDQUOT)) {
2200 mutex_enter(&rp->r_statelock);
2201 if (!rp->r_error)
2202 rp->r_error = error;
2203 mutex_exit(&rp->r_statelock);
2204 }
2205 }
2206 nfs_invalidate_pages(vp, (u_offset_t)0, cr);
2207 }
2208
2209 /*
2210 * Free any held credentials and caches which may be associated
2211 * with this rnode.
2212 */
2213 mutex_enter(&rp->r_statelock);
2214 cred = rp->r_cred;
2215 rp->r_cred = NULL;
2216 contents = rp->r_symlink.contents;
2217 size = rp->r_symlink.size;
2218 rp->r_symlink.contents = NULL;
2219 vsp = rp->r_secattr;
2220 rp->r_secattr = NULL;
2221 info = rp->r_pathconf;
2222 rp->r_pathconf = NULL;
2223 mutex_exit(&rp->r_statelock);
2224
2225 /*
2226 * Free the held credential.
2227 */
2228 if (cred != NULL)
2229 crfree(cred);
2230
2231 /*
2232 * Free the access cache entries.
2233 */
2234 (void) nfs_access_purge_rp(rp);
2235
2236 /*
2237 * Free the readdir cache entries.
2238 */
2239 if (HAVE_RDDIR_CACHE(rp))
2240 nfs_purge_rddir_cache(vp);
2241
2242 /*
2243 * Free the symbolic link cache.
2244 */
2245 if (contents != NULL) {
2246
2247 kmem_free((void *)contents, size);
2248 }
2249
2250 /*
2251 * Free any cached ACL.
2252 */
2253 if (vsp != NULL)
2254 nfs_acl_free(vsp);
2255
2256 /*
2257 * Free any cached pathconf information.
2258 */
2259 if (info != NULL)
2260 kmem_free(info, sizeof (*info));
2261 }
2262
2263 /*
2264 * Return a vnode for the given NFS Version 2 file handle.
2265 * If no rnode exists for this fhandle, create one and put it
2266 * into the hash queues. If the rnode for this fhandle
2267 * already exists, return it.
2268 *
2269 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2270 */
2271 vnode_t *
2272 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2273 hrtime_t t, cred_t *cr, char *dnm, char *nm)
2274 {
2275 int newnode;
2276 int index;
2277 vnode_t *vp;
2278 nfs_fhandle nfh;
2279 vattr_t va;
2280
2281 nfh.fh_len = NFS_FHSIZE;
2282 bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2283
2284 index = rtablehash(&nfh);
2285 rw_enter(&rtable[index].r_lock, RW_READER);
2286
2287 vp = make_rnode(&nfh, &rtable[index], vfsp, nfs_vnodeops,
2288 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2289
2290 if (attr != NULL) {
2291 if (!newnode) {
2292 rw_exit(&rtable[index].r_lock);
2293 (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2294 } else {
2295 if (attr->na_type < NFNON || attr->na_type > NFSOC)
2296 vp->v_type = VBAD;
2297 else
2298 vp->v_type = n2v_type(attr);
2299 /*
2300 * A translation here seems to be necessary
2301 * because this function can be called
2302 * with `attr' that has come from the wire,
2303 * and been operated on by vattr_to_nattr().
2304 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2305 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2306 * ->makenfsnode().
2307 */
2308 if ((attr->na_rdev & 0xffff0000) == 0)
2309 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2310 else
2311 vp->v_rdev = expldev(n2v_rdev(attr));
2312 nfs_attrcache(vp, attr, t);
2313 rw_exit(&rtable[index].r_lock);
2314 }
2315 } else {
2316 if (newnode) {
2317 PURGE_ATTRCACHE(vp);
2318 }
2319 rw_exit(&rtable[index].r_lock);
2320 }
2321
2322 return (vp);
2323 }
2324
2325 /*
2326 * Return a vnode for the given NFS Version 3 file handle.
2327 * If no rnode exists for this fhandle, create one and put it
2328 * into the hash queues. If the rnode for this fhandle
2329 * already exists, return it.
2330 *
2331 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2332 */
2333 vnode_t *
2334 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2335 cred_t *cr, char *dnm, char *nm)
2336 {
2337 int newnode;
2338 int index;
2339 vnode_t *vp;
2340
2341 index = rtablehash((nfs_fhandle *)fh);
2342 rw_enter(&rtable[index].r_lock, RW_READER);
2343
2344 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2345 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2346 dnm, nm);
2347
2348 if (vap == NULL) {
2349 if (newnode) {
2350 PURGE_ATTRCACHE(vp);
2351 }
2352 rw_exit(&rtable[index].r_lock);
2353 return (vp);
2354 }
2355
2356 if (!newnode) {
2357 rw_exit(&rtable[index].r_lock);
2358 nfs_attr_cache(vp, vap, t, cr);
2359 } else {
2360 rnode_t *rp = VTOR(vp);
2361
2362 vp->v_type = vap->va_type;
2363 vp->v_rdev = vap->va_rdev;
2364
2365 mutex_enter(&rp->r_statelock);
2366 if (rp->r_mtime <= t)
2367 nfs_attrcache_va(vp, vap);
2368 mutex_exit(&rp->r_statelock);
2369 rw_exit(&rtable[index].r_lock);
2370 }
2371
2372 return (vp);
2373 }
2374
2375 vnode_t *
2376 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2377 cred_t *cr, char *dnm, char *nm)
2378 {
2379 int newnode;
2380 int index;
2381 vnode_t *vp;
2382 vattr_t va;
2383
2384 index = rtablehash((nfs_fhandle *)fh);
2385 rw_enter(&rtable[index].r_lock, RW_READER);
2386
2387 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2388 nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2389 dnm, nm);
2390
2391 if (attr == NULL) {
2392 if (newnode) {
2393 PURGE_ATTRCACHE(vp);
2394 }
2395 rw_exit(&rtable[index].r_lock);
2396 return (vp);
2397 }
2398
2399 if (!newnode) {
2400 rw_exit(&rtable[index].r_lock);
2401 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2402 } else {
2403 if (attr->type < NF3REG || attr->type > NF3FIFO)
2404 vp->v_type = VBAD;
2405 else
2406 vp->v_type = nf3_to_vt[attr->type];
2407 vp->v_rdev = makedevice(attr->rdev.specdata1,
2408 attr->rdev.specdata2);
2409 nfs3_attrcache(vp, attr, t);
2410 rw_exit(&rtable[index].r_lock);
2411 }
2412
2413 return (vp);
2414 }
2415
2416 /*
2417 * Read this comment before making changes to rtablehash()!
2418 * This is a hash function in which seemingly obvious and harmless
2419 * changes can cause escalations costing millions of dollars!
2420 * Know what you are doing.
2421 *
2422 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
2423 * algorithm is currently detailed here:
2424 *
2425 * http://burtleburtle.net/bob/hash/doobs.html
2426 *
2427 * Of course, the above link may not be valid by the time you are reading
2428 * this, but suffice it to say that the one-at-a-time algorithm works well in
2429 * almost all cases. If you are changing the algorithm be sure to verify that
2430 * the hash algorithm still provides even distribution in all cases and with
2431 * any server returning filehandles in whatever order (sequential or random).
2432 */
2433 static int
2434 rtablehash(nfs_fhandle *fh)
2435 {
2436 ulong_t hash, len, i;
2437 char *key;
2438
2439 key = fh->fh_buf;
2440 len = (ulong_t)fh->fh_len;
2441 for (hash = 0, i = 0; i < len; i++) {
2442 hash += key[i];
2443 hash += (hash << 10);
2444 hash ^= (hash >> 6);
2445 }
2446 hash += (hash << 3);
2447 hash ^= (hash >> 11);
2448 hash += (hash << 15);
2449 return (hash & rtablemask);
2450 }
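
/*
 * Minimal user-level sketch (an assumption, not part of this file): the same
 * one-at-a-time hash in standalone form, which can be used to check bucket
 * distribution for a sample set of file handles against a given table mask.
 */
#if 0 /* example only */
static unsigned int
oaat_hash(const unsigned char *key, size_t len, unsigned int mask)
{
	unsigned int hash = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		hash += key[i];
		hash += (hash << 10);
		hash ^= (hash >> 6);
	}
	hash += (hash << 3);
	hash ^= (hash >> 11);
	hash += (hash << 15);
	return (hash & mask);	/* mask is assumed to be (table size - 1) */
}
#endif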
2451
2452 static vnode_t *
2453 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2454 struct vnodeops *vops,
2455 int (*putapage)(vnode_t *, page_t *, u_offset_t *, size_t *, int, cred_t *),
2456 int (*compar)(const void *, const void *),
2457 int *newnode, cred_t *cr, char *dnm, char *nm)
2458 {
2459 rnode_t *rp;
2460 rnode_t *trp;
2461 vnode_t *vp;
2462 mntinfo_t *mi;
2463
2464 ASSERT(RW_READ_HELD(&rhtp->r_lock));
2465
2466 mi = VFTOMI(vfsp);
2467 start:
2468 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2469 vp = RTOV(rp);
2470 nfs_set_vroot(vp);
2471 *newnode = 0;
2472 return (vp);
2473 }
2474 rw_exit(&rhtp->r_lock);
2475
2476 mutex_enter(&rpfreelist_lock);
2477 if (rpfreelist != NULL && rnew >= nrnode) {
2478 rp = rpfreelist;
2479 rp_rmfree(rp);
2480 mutex_exit(&rpfreelist_lock);
2481
2482 vp = RTOV(rp);
2483
2484 if (rp->r_flags & RHASHED) {
2485 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2486 mutex_enter(&vp->v_lock);
2487 if (vp->v_count > 1) {
2488 vp->v_count--;
2489 mutex_exit(&vp->v_lock);
2490 rw_exit(&rp->r_hashq->r_lock);
2491 rw_enter(&rhtp->r_lock, RW_READER);
2492 goto start;
2493 }
2494 mutex_exit(&vp->v_lock);
2495 rp_rmhash_locked(rp);
2496 rw_exit(&rp->r_hashq->r_lock);
2497 }
2498
2499 rinactive(rp, cr);
2500
2501 mutex_enter(&vp->v_lock);
2502 if (vp->v_count > 1) {
2503 vp->v_count--;
2504 mutex_exit(&vp->v_lock);
2505 rw_enter(&rhtp->r_lock, RW_READER);
2506 goto start;
2507 }
2508 mutex_exit(&vp->v_lock);
2509 vn_invalid(vp);
2510 /*
2511 * destroy old locks before bzero'ing and
2512 * recreating the locks below.
2513 */
2514 nfs_rw_destroy(&rp->r_rwlock);
2515 nfs_rw_destroy(&rp->r_lkserlock);
2516 mutex_destroy(&rp->r_statelock);
2517 cv_destroy(&rp->r_cv);
2518 cv_destroy(&rp->r_commit.c_cv);
2519 nfs_free_r_path(rp);
2520 avl_destroy(&rp->r_dir);
2521 /*
2522 * Make sure that if the rnode is recycled, the
2523 * VFS reference count is dropped properly before
2524 * reuse.
2525 */
2526 VFS_RELE(vp->v_vfsp);
2527 vn_reinit(vp);
2528 } else {
2529 vnode_t *new_vp;
2530
2531 mutex_exit(&rpfreelist_lock);
2532
2533 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2534 new_vp = vn_alloc(KM_SLEEP);
2535
2536 atomic_add_long((ulong_t *)&rnew, 1);
2537 #ifdef DEBUG
2538 clstat_debug.nrnode.value.ui64++;
2539 #endif
2540 vp = new_vp;
2541 }
2542
2543 bzero(rp, sizeof (*rp));
2544 rp->r_vnode = vp;
2545 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2546 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2547 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2548 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2549 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2550 rp->r_fh.fh_len = fh->fh_len;
2551 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2552 rp->r_server = mi->mi_curr_serv;
2553 if (FAILOVER_MOUNT(mi)) {
2554 /*
2555 * If replicated servers, stash pathnames
2556 */
2557 if (dnm != NULL && nm != NULL) {
2558 char *s, *p;
2559 uint_t len;
2560
2561 len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2562 rp->r_path = kmem_alloc(len, KM_SLEEP);
2563 #ifdef DEBUG
2564 clstat_debug.rpath.value.ui64 += len;
2565 #endif
2566 s = rp->r_path;
2567 for (p = dnm; *p; p++)
2568 *s++ = *p;
2569 *s++ = '/';
2570 for (p = nm; *p; p++)
2571 *s++ = *p;
2572 *s = '\0';
2573 } else {
2574 /* special case for root */
2575 rp->r_path = kmem_alloc(2, KM_SLEEP);
2576 #ifdef DEBUG
2577 clstat_debug.rpath.value.ui64 += 2;
2578 #endif
2579 *rp->r_path = '.';
2580 *(rp->r_path + 1) = '\0';
2581 }
2582 }
2583 VFS_HOLD(vfsp);
2584 rp->r_putapage = putapage;
2585 rp->r_hashq = rhtp;
2586 rp->r_flags = RREADDIRPLUS;
2587 avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2588 offsetof(rddir_cache, tree));
2589 vn_setops(vp, vops);
2590 vp->v_data = (caddr_t)rp;
2591 vp->v_vfsp = vfsp;
2592 vp->v_type = VNON;
2593 vp->v_flag |= VMODSORT;
2594 nfs_set_vroot(vp);
2595
2596 /*
2597 * There is a race condition if someone else
2598 * alloc's the rnode while no locks are held, so we
2599 * check again and recover if found.
2600 */
2601 rw_enter(&rhtp->r_lock, RW_WRITER);
2602 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2603 vp = RTOV(trp);
2604 nfs_set_vroot(vp);
2605 *newnode = 0;
2606 rw_exit(&rhtp->r_lock);
2607 rp_addfree(rp, cr);
2608 rw_enter(&rhtp->r_lock, RW_READER);
2609 return (vp);
2610 }
2611 rp_addhash(rp);
2612 *newnode = 1;
2613 return (vp);
2614 }
2615
2616 /*
2617 * Callback function to check if the page should be marked as
2618 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2619 */
2620 int
2621 nfs_setmod_check(page_t *pp)
2622 {
2623 if (pp->p_fsdata != C_NOCOMMIT) {
2624 pp->p_fsdata = C_NOCOMMIT;
2625 return (1);
2626 }
2627 return (0);
2628 }
2629
2630 static void
2631 nfs_set_vroot(vnode_t *vp)
2632 {
2633 rnode_t *rp;
2634 nfs_fhandle *rootfh;
2635
2636 rp = VTOR(vp);
2637 rootfh = &rp->r_server->sv_fhandle;
2638 if (rootfh->fh_len == rp->r_fh.fh_len &&
2639 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2640 if (!(vp->v_flag & VROOT)) {
2641 mutex_enter(&vp->v_lock);
2642 vp->v_flag |= VROOT;
2643 mutex_exit(&vp->v_lock);
2644 }
2645 }
2646 }
2647
2648 static void
2649 nfs_free_r_path(rnode_t *rp)
2650 {
2651 char *path;
2652 size_t len;
2653
2654 path = rp->r_path;
2655 if (path) {
2656 rp->r_path = NULL;
2657 len = strlen(path) + 1;
2658 kmem_free(path, len);
2659 #ifdef DEBUG
2660 clstat_debug.rpath.value.ui64 -= len;
2661 #endif
2662 }
2663 }
2664
2665 /*
2666 * Put an rnode on the free list.
2667 *
2668 * Rnodes which were allocated above and beyond the normal limit
2669 * are immediately freed.
2670 */
2671 void
2672 rp_addfree(rnode_t *rp, cred_t *cr)
2673 {
2674 vnode_t *vp;
2675 struct vfs *vfsp;
2676
2677 vp = RTOV(rp);
2678 ASSERT(vp->v_count >= 1);
2679 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2680
2681 /*
2682 * If we have too many rnodes allocated and there are no
2683 * references to this rnode, or if the rnode is no longer
2684 * accessible because it does not reside in the hash queues,
2685 * or if an i/o error occurred while writing to the file,
2686 * then just free it instead of putting it on the rnode
2687 * freelist.
2688 */
2689 vfsp = vp->v_vfsp;
2690 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2691 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2692 if (rp->r_flags & RHASHED) {
2693 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2694 mutex_enter(&vp->v_lock);
2695 if (vp->v_count > 1) {
2696 vp->v_count--;
2697 mutex_exit(&vp->v_lock);
2698 rw_exit(&rp->r_hashq->r_lock);
2699 return;
2700 }
2701 mutex_exit(&vp->v_lock);
2702 rp_rmhash_locked(rp);
2703 rw_exit(&rp->r_hashq->r_lock);
2704 }
2705
2706 rinactive(rp, cr);
2707
2708 /*
2709 * Recheck the vnode reference count. We need to
2710 * make sure that another reference has not been
2711 * acquired while we were not holding v_lock. The
2712 * rnode is not in the rnode hash queues, so the
2713 * only way for a reference to have been acquired
2714 * is for a VOP_PUTPAGE because the rnode was marked
2715 * with RDIRTY or for a modified page. This
2716 * reference may have been acquired before our call
2717 * to rinactive. The i/o may have been completed,
2718 * thus allowing rinactive to complete, but the
2719 * reference to the vnode may not have been released
2720 * yet. In any case, the rnode can not be destroyed
2721 * until the other references to this vnode have been
2722 * released. The other references will take care of
2723 * either destroying the rnode or placing it on the
2724 * rnode freelist. If there are no other references,
2725 * then the rnode may be safely destroyed.
2726 */
2727 mutex_enter(&vp->v_lock);
2728 if (vp->v_count > 1) {
2729 vp->v_count--;
2730 mutex_exit(&vp->v_lock);
2731 return;
2732 }
2733 mutex_exit(&vp->v_lock);
2734
2735 destroy_rnode(rp);
2736 return;
2737 }
2738
2739 /*
2740 * Lock the hash queue and then recheck the reference count
2741 * to ensure that no other threads have acquired a reference
2742 * to indicate that the rnode should not be placed on the
2743 * freelist. If another reference has been acquired, then
2744 * just release this one and let the other thread complete
2745 * the processing of adding this rnode to the freelist.
2746 */
2747 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2748
2749 mutex_enter(&vp->v_lock);
2750 if (vp->v_count > 1) {
2751 vp->v_count--;
2752 mutex_exit(&vp->v_lock);
2753 rw_exit(&rp->r_hashq->r_lock);
2754 return;
2755 }
2756 mutex_exit(&vp->v_lock);
2757
2758 /*
2759 * If there is no cached data or metadata for this file, then
2760 * put the rnode on the front of the freelist so that it will
2761 * be reused before other rnodes which may have cached data or
2762 * metadata associated with them.
2763 */
2764 mutex_enter(&rpfreelist_lock);
2765 if (rpfreelist == NULL) {
2766 rp->r_freef = rp;
2767 rp->r_freeb = rp;
2768 rpfreelist = rp;
2769 } else {
2770 rp->r_freef = rpfreelist;
2771 rp->r_freeb = rpfreelist->r_freeb;
2772 rpfreelist->r_freeb->r_freef = rp;
2773 rpfreelist->r_freeb = rp;
2774 if (!vn_has_cached_data(vp) &&
2775 !HAVE_RDDIR_CACHE(rp) &&
2776 rp->r_symlink.contents == NULL &&
2777 rp->r_secattr == NULL &&
2778 rp->r_pathconf == NULL)
2779 rpfreelist = rp;
2780 }
2781 mutex_exit(&rpfreelist_lock);
2782
2783 rw_exit(&rp->r_hashq->r_lock);
2784 }
2785
2786 /*
2787 * Remove an rnode from the free list.
2788 *
2789 * The caller must be holding rpfreelist_lock and the rnode
2790 * must be on the freelist.
2791 */
2792 static void
2793 rp_rmfree(rnode_t *rp)
2794 {
2795
2796 ASSERT(MUTEX_HELD(&rpfreelist_lock));
2797 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2798
2799 if (rp == rpfreelist) {
2800 rpfreelist = rp->r_freef;
2801 if (rp == rpfreelist)
2802 rpfreelist = NULL;
2803 }
2804
2805 rp->r_freeb->r_freef = rp->r_freef;
2806 rp->r_freef->r_freeb = rp->r_freeb;
2807
2808 rp->r_freef = rp->r_freeb = NULL;
2809 }
2810
2811 /*
2812 * Put an rnode in the hash table.
2813 *
2814 * The caller must be holding the exclusive hash queue lock.
2815 */
2816 static void
2817 rp_addhash(rnode_t *rp)
2818 {
2819
2820 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2821 ASSERT(!(rp->r_flags & RHASHED));
2822
2823 rp->r_hashf = rp->r_hashq->r_hashf;
2824 rp->r_hashq->r_hashf = rp;
2825 rp->r_hashb = (rnode_t *)rp->r_hashq;
2826 rp->r_hashf->r_hashb = rp;
2827
2828 mutex_enter(&rp->r_statelock);
2829 rp->r_flags |= RHASHED;
2830 mutex_exit(&rp->r_statelock);
2831 }
2832
2833 /*
2834 * Remove an rnode from the hash table.
2835 *
2836 * The caller must be holding the hash queue lock.
2837 */
2838 static void
2839 rp_rmhash_locked(rnode_t *rp)
2840 {
2841
2842 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2843 ASSERT(rp->r_flags & RHASHED);
2844
2845 rp->r_hashb->r_hashf = rp->r_hashf;
2846 rp->r_hashf->r_hashb = rp->r_hashb;
2847
2848 mutex_enter(&rp->r_statelock);
2849 rp->r_flags &= ~RHASHED;
2850 mutex_exit(&rp->r_statelock);
2851 }
2852
2853 /*
2854 * Remove an rnode from the hash table.
2855 *
2856 * The caller must not be holding the hash queue lock.
2857 */
2858 void
2859 rp_rmhash(rnode_t *rp)
2860 {
2861
2862 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2863 rp_rmhash_locked(rp);
2864 rw_exit(&rp->r_hashq->r_lock);
2865 }
2866
2867 /*
2868 * Look up an rnode by fhandle.
2869 *
2870 * The caller must be holding the hash queue lock, either shared or exclusive.
2871 */
2872 static rnode_t *
2873 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2874 {
2875 rnode_t *rp;
2876 vnode_t *vp;
2877
2878 ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2879
2880 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2881 vp = RTOV(rp);
2882 if (vp->v_vfsp == vfsp &&
2883 rp->r_fh.fh_len == fh->fh_len &&
2884 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2885 /*
2886 * remove rnode from free list, if necessary.
2887 */
2888 if (rp->r_freef != NULL) {
2889 mutex_enter(&rpfreelist_lock);
2890 /*
2891 * If the rnode is on the freelist,
2892 * then remove it and use that reference
2893 * as the new reference. Otherwise,
2894 * we need to increment the reference count.
2895 */
2896 if (rp->r_freef != NULL) {
2897 rp_rmfree(rp);
2898 mutex_exit(&rpfreelist_lock);
2899 } else {
2900 mutex_exit(&rpfreelist_lock);
2901 VN_HOLD(vp);
2902 }
2903 } else
2904 VN_HOLD(vp);
2905 return (rp);
2906 }
2907 }
2908 return (NULL);
2909 }
2910
2911 /*
2912 * Return 1 if there is an active vnode belonging to this vfs in the
2913 * rtable cache.
2914 *
2915 * Several of these checks are done without holding the usual
2916 * locks. This is safe because destroy_rtable(), rp_addfree(),
2917 * etc. will redo the necessary checks before actually destroying
2918 * any rnodes.
2919 */
2920 int
2921 check_rtable(struct vfs *vfsp)
2922 {
2923 int index;
2924 rnode_t *rp;
2925 vnode_t *vp;
2926
2927 for (index = 0; index < rtablesize; index++) {
2928 rw_enter(&rtable[index].r_lock, RW_READER);
2929 for (rp = rtable[index].r_hashf;
2930 rp != (rnode_t *)(&rtable[index]);
2931 rp = rp->r_hashf) {
2932 vp = RTOV(rp);
2933 if (vp->v_vfsp == vfsp) {
2934 if (rp->r_freef == NULL ||
2935 (vn_has_cached_data(vp) &&
2936 (rp->r_flags & RDIRTY)) ||
2937 rp->r_count > 0) {
2938 rw_exit(&rtable[index].r_lock);
2939 return (1);
2940 }
2941 }
2942 }
2943 rw_exit(&rtable[index].r_lock);
2944 }
2945 return (0);
2946 }
2947
2948 /*
2949 * Destroy inactive vnodes from the hash queues which belong to this
2950 * vfs. It is essential that we destroy all inactive vnodes during a
2951 * forced unmount as well as during a normal unmount.
2952 */
2953 void
2954 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2955 {
2956 int index;
2957 rnode_t *rp;
2958 rnode_t *rlist;
2959 rnode_t *r_hashf;
2960 vnode_t *vp;
2961
2962 rlist = NULL;
2963
2964 for (index = 0; index < rtablesize; index++) {
2965 rw_enter(&rtable[index].r_lock, RW_WRITER);
2966 for (rp = rtable[index].r_hashf;
2967 rp != (rnode_t *)(&rtable[index]);
2968 rp = r_hashf) {
2969 /* save the hash pointer before destroying */
2970 r_hashf = rp->r_hashf;
2971 vp = RTOV(rp);
2972 if (vp->v_vfsp == vfsp) {
2973 mutex_enter(&rpfreelist_lock);
2974 if (rp->r_freef != NULL) {
2975 rp_rmfree(rp);
2976 mutex_exit(&rpfreelist_lock);
2977 rp_rmhash_locked(rp);
2978 rp->r_hashf = rlist;
2979 rlist = rp;
2980 } else
2981 mutex_exit(&rpfreelist_lock);
2982 }
2983 }
2984 rw_exit(&rtable[index].r_lock);
2985 }
2986
2987 for (rp = rlist; rp != NULL; rp = rlist) {
2988 rlist = rp->r_hashf;
2989 /*
2990 * This call to rp_addfree will end up destroying the
2991 * rnode, but in a safe way with the appropriate set
2992 * of checks done.
2993 */
2994 rp_addfree(rp, cr);
2995 }
2996
2997 }
2998
2999 /*
3000 * This routine destroys all the resources associated with the rnode
3001 * and then the rnode itself.
3002 */
3003 static void
3004 destroy_rnode(rnode_t *rp)
3005 {
3006 vnode_t *vp;
3007 vfs_t *vfsp;
3008
3009 vp = RTOV(rp);
3010 vfsp = vp->v_vfsp;
3011
3012 ASSERT(vp->v_count == 1);
3013 ASSERT(rp->r_count == 0);
3014 ASSERT(rp->r_lmpl == NULL);
3015 ASSERT(rp->r_mapcnt == 0);
3016 ASSERT(!(rp->r_flags & RHASHED));
3017 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3018 atomic_add_long((ulong_t *)&rnew, -1);
3019 #ifdef DEBUG
3020 clstat_debug.nrnode.value.ui64--;
3021 #endif
3022 nfs_rw_destroy(&rp->r_rwlock);
3023 nfs_rw_destroy(&rp->r_lkserlock);
3024 mutex_destroy(&rp->r_statelock);
3025 cv_destroy(&rp->r_cv);
3026 cv_destroy(&rp->r_commit.c_cv);
3027 if (rp->r_flags & RDELMAPLIST)
3028 list_destroy(&rp->r_indelmap);
3029 nfs_free_r_path(rp);
3030 avl_destroy(&rp->r_dir);
3031 vn_invalid(vp);
3032 vn_free(vp);
3033 kmem_cache_free(rnode_cache, rp);
3034 VFS_RELE(vfsp);
3035 }
3036
3037 /*
3038 * Flush all vnodes in this (or every) vfs.
3039 * Used by nfs_sync and by nfs_unmount.
3040 */
3041 void
3042 rflush(struct vfs *vfsp, cred_t *cr)
3043 {
3044 int index;
3045 rnode_t *rp;
3046 vnode_t *vp, **vplist;
3047 long num, cnt;
3048
3049 /*
3050 * Check to see whether there is anything to do.
3051 */
3052 num = rnew;
3053 if (num == 0)
3054 return;
3055
3056 /*
3057 * Allocate a slot for all currently active rnodes on the
3058 * supposition that they all may need flushing.
3059 */
3060 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3061 cnt = 0;
3062
3063 /*
3064 * Walk the hash queues looking for rnodes with page
3065 * lists associated with them. Make a list of these
3066 * files.
3067 */
3068 for (index = 0; index < rtablesize; index++) {
3069 rw_enter(&rtable[index].r_lock, RW_READER);
3070 for (rp = rtable[index].r_hashf;
3071 rp != (rnode_t *)(&rtable[index]);
3072 rp = rp->r_hashf) {
3073 vp = RTOV(rp);
3074 /*
3075 * Don't bother sync'ing a vp if it
3076 * is part of a virtual swap device or
3077 * if the VFS is read-only
3078 */
3079 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3080 continue;
3081 /*
3082 * If we are flushing all mounted file systems or
3083 * the vnode belongs to this vfs, and the vnode
3084 * has pages and is marked as either dirty or
3085 * mmap'd, hold this vnode and add it to the list
3086 * of vnodes to flush.
3087 */
3088 if ((vfsp == NULL || vp->v_vfsp == vfsp) &&
3089 vn_has_cached_data(vp) &&
3090 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3091 VN_HOLD(vp);
3092 vplist[cnt++] = vp;
3093 if (cnt == num) {
3094 rw_exit(&rtable[index].r_lock);
3095 goto toomany;
3096 }
3097 }
3098 }
3099 rw_exit(&rtable[index].r_lock);
3100 }
3101 toomany:
3102
3103 /*
3104 * Flush and release all of the files on the list.
3105 */
3106 while (cnt-- > 0) {
3107 vp = vplist[cnt];
3108 (void) VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_ASYNC, cr, NULL);
3109 VN_RELE(vp);
3110 }
3111
3112 /*
3113 * Free the space allocated to hold the list.
3114 */
3115 kmem_free(vplist, num * sizeof (*vplist));
3116 }
3117
3118 /*
3119 * This probably needs to be larger than or equal to
3120 * log2(sizeof (struct rnode)) due to the way that rnodes are
3121 * allocated.
3122 */
3123 #define ACACHE_SHIFT_BITS 9
3124
3125 static int
3126 acachehash(rnode_t *rp, cred_t *cr)
3127 {
3128
3129 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3130 acachemask);
3131 }
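
/*
 * Worked example (illustration only, not from the original source): because
 * rnodes come out of a kmem cache, the low-order bits of an rnode pointer
 * carry little information. Shifting right by ACACHE_SHIFT_BITS discards
 * them; e.g. addresses ending in 0x1200 and 0x1400 become 0x9 and 0xa
 * respectively before the uid is added and the result is masked with
 * acachemask.
 */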
3132
3133 #ifdef DEBUG
3134 static long nfs_access_cache_hits = 0;
3135 static long nfs_access_cache_misses = 0;
3136 #endif
3137
3138 nfs_access_type_t
3139 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3140 {
3141 vnode_t *vp;
3142 acache_t *ap;
3143 acache_hash_t *hp;
3144 nfs_access_type_t all;
3145
3146 vp = RTOV(rp);
3147 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3148 return (NFS_ACCESS_UNKNOWN);
3149
3150 if (rp->r_acache != NULL) {
3151 hp = &acache[acachehash(rp, cr)];
3152 rw_enter(&hp->lock, RW_READER);
3153 ap = hp->next;
3154 while (ap != (acache_t *)hp) {
3155 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3156 if ((ap->known & acc) == acc) {
3157 #ifdef DEBUG
3158 nfs_access_cache_hits++;
3159 #endif
3160 if ((ap->allowed & acc) == acc)
3161 all = NFS_ACCESS_ALLOWED;
3162 else
3163 all = NFS_ACCESS_DENIED;
3164 } else {
3165 #ifdef DEBUG
3166 nfs_access_cache_misses++;
3167 #endif
3168 all = NFS_ACCESS_UNKNOWN;
3169 }
3170 rw_exit(&hp->lock);
3171 return (all);
3172 }
3173 ap = ap->next;
3174 }
3175 rw_exit(&hp->lock);
3176 }
3177
3178 #ifdef DEBUG
3179 nfs_access_cache_misses++;
3180 #endif
3181 return (NFS_ACCESS_UNKNOWN);
3182 }
3183
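/*
 * Hedged usage sketch (hypothetical caller, not from this file): the typical
 * pattern is to consult nfs_access_check() first and fall back to an
 * over-the-wire ACCESS request, caching the result via nfs_access_cache()
 * below. This sketch takes the over-the-wire result (resacc) as a parameter
 * rather than performing the RPC itself.
 */
#if 0 /* example only */
static int
example_check_access(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
{
	if (nfs_access_check(rp, acc, cr) == NFS_ACCESS_ALLOWED)
		return (0);
	/* a real caller would issue the ACCESS RPC here to obtain resacc */
	nfs_access_cache(rp, acc, resacc, cr);
	return ((resacc & acc) == acc ? 0 : EACCES);
}
#endif
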
3184 void
3185 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3186 {
3187 acache_t *ap;
3188 acache_t *nap;
3189 acache_hash_t *hp;
3190
3191 hp = &acache[acachehash(rp, cr)];
3192
3193 /*
3194 * Allocate now on the assumption that an allocation will
3195 * usually be required. This allows the allocation to happen
3196 * without holding the hash bucket lock.
3197 */
3198 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3199 if (nap != NULL) {
3200 nap->known = acc;
3201 nap->allowed = resacc;
3202 nap->rnode = rp;
3203 crhold(cr);
3204 nap->cred = cr;
3205 nap->hashq = hp;
3206 }
3207
3208 rw_enter(&hp->lock, RW_WRITER);
3209
3210 if (rp->r_acache != NULL) {
3211 ap = hp->next;
3212 while (ap != (acache_t *)hp) {
3213 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3214 ap->known |= acc;
3215 ap->allowed &= ~acc;
3216 ap->allowed |= resacc;
3217 rw_exit(&hp->lock);
3218 if (nap != NULL) {
3219 crfree(nap->cred);
3220 kmem_cache_free(acache_cache, nap);
3221 }
3222 return;
3223 }
3224 ap = ap->next;
3225 }
3226 }
3227
3228 if (nap != NULL) {
3229 #ifdef DEBUG
3230 clstat_debug.access.value.ui64++;
3231 #endif
3232 nap->next = hp->next;
3233 hp->next = nap;
3234 nap->next->prev = nap;
3235 nap->prev = (acache_t *)hp;
3236
3237 mutex_enter(&rp->r_statelock);
3238 nap->list = rp->r_acache;
3239 rp->r_acache = nap;
3240 mutex_exit(&rp->r_statelock);
3241 }
3242
3243 rw_exit(&hp->lock);
3244 }
3245
3246 int
3247 nfs_access_purge_rp(rnode_t *rp)
3248 {
3249 acache_t *ap;
3250 acache_t *tmpap;
3251 acache_t *rplist;
3252
3253 /*
3254 * If there aren't any cached entries, then there is nothing
3255 * to free.
3256 */
3257 if (rp->r_acache == NULL)
3258 return (0);
3259
3260 mutex_enter(&rp->r_statelock);
3261 rplist = rp->r_acache;
3262 rp->r_acache = NULL;
3263 mutex_exit(&rp->r_statelock);
3264
3265 /*
3266 * Loop through each entry in the list pointed to in the
3267 * rnode. Remove each of these entries from the hash
3268 * queue that it is on and remove it from the list in
3269 * the rnode.
3270 */
3271 for (ap = rplist; ap != NULL; ap = tmpap) {
3272 rw_enter(&ap->hashq->lock, RW_WRITER);
3273 ap->prev->next = ap->next;
3274 ap->next->prev = ap->prev;
3275 rw_exit(&ap->hashq->lock);
3276
3277 tmpap = ap->list;
3278 crfree(ap->cred);
3279 kmem_cache_free(acache_cache, ap);
3280 #ifdef DEBUG
3281 clstat_debug.access.value.ui64--;
3282 #endif
3283 }
3284
3285 return (1);
3286 }
3287
3288 static const char prefix[] = ".nfs";
3289
3290 static kmutex_t newnum_lock;
3291
3292 int
3293 newnum(void)
3294 {
3295 static uint_t newnum = 0;
3296 uint_t id;
3297
3298 mutex_enter(&newnum_lock);
3299 if (newnum == 0)
3300 newnum = gethrestime_sec() & 0xffff;
3301 id = newnum++;
3302 mutex_exit(&newnum_lock);
3303 return (id);
3304 }
3305
3306 char *
3307 newname(void)
3308 {
3309 char *news;
3310 char *s;
3311 const char *p;
3312 uint_t id;
3313
3314 id = newnum();
3315 news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3316 s = news;
3317 p = prefix;
3318 while (*p != '\0')
3319 *s++ = *p++;
3320 while (id != 0) {
3321 *s++ = "0123456789ABCDEF"[id & 0x0f];
3322 id >>= 4;
3323 }
3324 *s = '\0';
3325 return (news);
3326 }
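
/*
 * Worked example (illustration only): newname() emits the id's hex digits
 * low-order nibble first, so an id of 0x1A2B would produce the string
 * ".nfsB2A1" rather than ".nfs1A2B".
 */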
3327
3328 /*
3329 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3330 * framework.
3331 */
3332 static int
3333 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3334 {
3335 ksp->ks_snaptime = gethrtime();
3336 if (rw == KSTAT_WRITE) {
3337 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3338 #ifdef DEBUG
3339 /*
3340 * Currently only the global zone can write to kstats, but we
3341 * add the check just for paranoia.
3342 */
3343 if (INGLOBALZONE(curproc))
3344 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3345 sizeof (clstat_debug));
3346 #endif
3347 } else {
3348 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3349 #ifdef DEBUG
3350 /*
3351 * If we're displaying the "global" debug kstat values, we
3352 * display them as-is to all zones since in fact they apply to
3353 * the system as a whole.
3354 */
3355 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3356 sizeof (clstat_debug));
3357 #endif
3358 }
3359 return (0);
3360 }
3361
3362 static void *
3363 clinit_zone(zoneid_t zoneid)
3364 {
3365 kstat_t *nfs_client_kstat;
3366 struct nfs_clnt *nfscl;
3367 uint_t ndata;
3368
3369 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3370 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3371 nfscl->nfscl_chtable = NULL;
3372 nfscl->nfscl_zoneid = zoneid;
3373
3374 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3375 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3376 #ifdef DEBUG
3377 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3378 #endif
3379 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3380 "misc", KSTAT_TYPE_NAMED, ndata,
3381 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3382 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3383 nfs_client_kstat->ks_snapshot = cl_snapshot;
3384 kstat_install(nfs_client_kstat);
3385 }
3386 mutex_enter(&nfs_clnt_list_lock);
3387 list_insert_head(&nfs_clnt_list, nfscl);
3388 mutex_exit(&nfs_clnt_list_lock);
3389 return (nfscl);
3390 }
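
/*
 * Usage note (an assumption about the userland tooling, not from this file):
 * the named kstat registered above can typically be inspected from the
 * command line with kstat(1M), e.g. "kstat -m nfs -i 0 -n nfs_client".
 */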
3391
3392 /*ARGSUSED*/
3393 static void
3394 clfini_zone(zoneid_t zoneid, void *arg)
3395 {
3396 struct nfs_clnt *nfscl = arg;
3397 chhead_t *chp, *next;
3398
3399 if (nfscl == NULL)
3400 return;
3401 mutex_enter(&nfs_clnt_list_lock);
3402 list_remove(&nfs_clnt_list, nfscl);
3403 mutex_exit(&nfs_clnt_list_lock);
3404 clreclaim_zone(nfscl, 0);
3405 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3406 ASSERT(chp->ch_list == NULL);
3407 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3408 next = chp->ch_next;
3409 kmem_free(chp, sizeof (*chp));
3410 }
3411 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3412 mutex_destroy(&nfscl->nfscl_chtable_lock);
3413 kmem_free(nfscl, sizeof (*nfscl));
3414 }
3415
3416 /*
3417 * Called by endpnt_destructor to make sure the client handles are
3418 * cleaned up before the RPC endpoints. This becomes a no-op if
3419 * clfini_zone (above) is called first. This function is needed
3420 * (rather than relying on clfini_zone to clean up) because the ZSD
3421 * callbacks have no ordering mechanism, so we have no way to ensure
3422 * that clfini_zone is called before endpnt_destructor.
3423 */
3424 void
3425 clcleanup_zone(zoneid_t zoneid)
3426 {
3427 struct nfs_clnt *nfscl;
3428
3429 mutex_enter(&nfs_clnt_list_lock);
3430 nfscl = list_head(&nfs_clnt_list);
3431 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3432 if (nfscl->nfscl_zoneid == zoneid) {
3433 clreclaim_zone(nfscl, 0);
3434 break;
3435 }
3436 }
3437 mutex_exit(&nfs_clnt_list_lock);
3438 }
3439
3440 int
3441 nfs_subrinit(void)
3442 {
3443 int i;
3444 ulong_t nrnode_max;
3445
3446 /*
3447 * Allocate and initialize the rnode hash queues
3448 */
3449 if (nrnode <= 0)
3450 nrnode = ncsize;
3451 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3452 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3453 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3454 "setting nrnode to max value of %ld", nrnode_max);
3455 nrnode = nrnode_max;
3456 }
3457
3458 rtablesize = 1 << highbit(nrnode / hashlen);
3459 rtablemask = rtablesize - 1;
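/*
 * Worked sizing example (illustration only; the value of hashlen used here
 * is an assumption): with nrnode = 4000 and hashlen = 4, the code above
 * yields rtablesize = 1 << highbit(1000) = 1024 buckets and
 * rtablemask = 1023.
 */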
3460 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3461 for (i = 0; i < rtablesize; i++) {
3462 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3463 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3464 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3465 }
3466 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3467 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3468
3469 /*
3470 * Allocate and initialize the access cache
3471 */
3472
3473 /*
3474 * The initial guess is one access cache entry per rnode.
3475 * If nacache is set to a non-zero value, it is used instead
3476 * as a guess at the number of access cache entries.
3477 */
3478 if (nacache > 0)
3479 acachesize = 1 << highbit(nacache / hashlen);
3480 else
3481 acachesize = rtablesize;
3482 acachemask = acachesize - 1;
3483 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3484 for (i = 0; i < acachesize; i++) {
3485 acache[i].next = (acache_t *)&acache[i];
3486 acache[i].prev = (acache_t *)&acache[i];
3487 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3488 }
3489 acache_cache = kmem_cache_create("nfs_access_cache",
3490 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3491 /*
3492 * Allocate and initialize the client handle cache
3493 */
3494 chtab_cache = kmem_cache_create("client_handle_cache",
3495 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3496 /*
3497 * Initialize the list of per-zone client handles (and associated data).
3498 * This needs to be done before we call zone_key_create().
3499 */
3500 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3501 offsetof(struct nfs_clnt, nfscl_node));
3502 /*
3503 * Initialize the zone_key for per-zone client handle lists.
3504 */
3505 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3506 /*
3507 * Initialize the various mutexes and reader/writer locks
3508 */
3509 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3510 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3511 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3512
3513 /*
3514 * Assign unique major number for all nfs mounts
3515 */
3516 if ((nfs_major = getudev()) == -1) {
3517 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3518 "nfs: init: can't get unique device number");
3519 nfs_major = 0;
3520 }
3521 nfs_minor = 0;
3522
3523 if (nfs3_jukebox_delay == 0)
3524 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3525
3526 return (0);
3527 }
3528
3529 void
3530 nfs_subrfini(void)
3531 {
3532 int i;
3533
3534 /*
3535 * Deallocate the rnode hash queues
3536 */
3537 kmem_cache_destroy(rnode_cache);
3538
3539 for (i = 0; i < rtablesize; i++)
3540 rw_destroy(&rtable[i].r_lock);
3541 kmem_free(rtable, rtablesize * sizeof (*rtable));
3542
3543 /*
3544 * Deallocate the access cache
3545 */
3546 kmem_cache_destroy(acache_cache);
3547
3548 for (i = 0; i < acachesize; i++)
3549 rw_destroy(&acache[i].lock);
3550 kmem_free(acache, acachesize * sizeof (*acache));
3551
3552 /*
3553 * Deallocate the client handle cache
3554 */
3555 kmem_cache_destroy(chtab_cache);
3556
3557 /*
3558 * Destroy the various mutexes and reader/writer locks
3559 */
3560 mutex_destroy(&rpfreelist_lock);
3561 mutex_destroy(&newnum_lock);
3562 mutex_destroy(&nfs_minor_lock);
3563 (void) zone_key_delete(nfsclnt_zone_key);
3564 }
3565
3566 enum nfsstat
3567 puterrno(int error)
3568 {
3569
3570 switch (error) {
3571 case EOPNOTSUPP:
3572 return (NFSERR_OPNOTSUPP);
3573 case ENAMETOOLONG:
3574 return (NFSERR_NAMETOOLONG);
3575 case ENOTEMPTY:
3576 return (NFSERR_NOTEMPTY);
3577 case EDQUOT:
3578 return (NFSERR_DQUOT);
3579 case ESTALE:
3580 return (NFSERR_STALE);
3581 case EREMOTE:
3582 return (NFSERR_REMOTE);
3583 case ENOSYS:
3584 return (NFSERR_OPNOTSUPP);
3585 case EOVERFLOW:
3586 return (NFSERR_INVAL);
3587 default:
3588 return ((enum nfsstat)error);
3589 }
3590 /* NOTREACHED */
3591 }
3592
3593 int
3594 geterrno(enum nfsstat status)
3595 {
3596
3597 switch (status) {
3598 case NFSERR_OPNOTSUPP:
3599 return (EOPNOTSUPP);
3600 case NFSERR_NAMETOOLONG:
3601 return (ENAMETOOLONG);
3602 case NFSERR_NOTEMPTY:
3603 return (ENOTEMPTY);
3604 case NFSERR_DQUOT:
3605 return (EDQUOT);
3606 case NFSERR_STALE:
3607 return (ESTALE);
3608 case NFSERR_REMOTE:
3609 return (EREMOTE);
3610 case NFSERR_WFLUSH:
3611 return (EIO);
3612 default:
3613 return ((int)status);
3614 }
3615 /* NOTREACHED */
3616 }
3617
3618 enum nfsstat3
3619 puterrno3(int error)
3620 {
3621
3622 #ifdef DEBUG
3623 switch (error) {
3624 case 0:
3625 return (NFS3_OK);
3626 case EPERM:
3627 return (NFS3ERR_PERM);
3628 case ENOENT:
3629 return (NFS3ERR_NOENT);
3630 case EIO:
3631 return (NFS3ERR_IO);
3632 case ENXIO:
3633 return (NFS3ERR_NXIO);
3634 case EACCES:
3635 return (NFS3ERR_ACCES);
3636 case EEXIST:
3637 return (NFS3ERR_EXIST);
3638 case EXDEV:
3639 return (NFS3ERR_XDEV);
3640 case ENODEV:
3641 return (NFS3ERR_NODEV);
3642 case ENOTDIR:
3643 return (NFS3ERR_NOTDIR);
3644 case EISDIR:
3645 return (NFS3ERR_ISDIR);
3646 case EINVAL:
3647 return (NFS3ERR_INVAL);
3648 case EFBIG:
3649 return (NFS3ERR_FBIG);
3650 case ENOSPC:
3651 return (NFS3ERR_NOSPC);
3652 case EROFS:
3653 return (NFS3ERR_ROFS);
3654 case EMLINK:
3655 return (NFS3ERR_MLINK);
3656 case ENAMETOOLONG:
3657 return (NFS3ERR_NAMETOOLONG);
3658 case ENOTEMPTY:
3659 return (NFS3ERR_NOTEMPTY);
3660 case EDQUOT:
3661 return (NFS3ERR_DQUOT);
3662 case ESTALE:
3663 return (NFS3ERR_STALE);
3664 case EREMOTE:
3665 return (NFS3ERR_REMOTE);
3666 case ENOSYS:
3667 case EOPNOTSUPP:
3668 return (NFS3ERR_NOTSUPP);
3669 case EOVERFLOW:
3670 return (NFS3ERR_INVAL);
3671 default:
3672 zcmn_err(getzoneid(), CE_WARN,
3673 "puterrno3: got error %d", error);
3674 return ((enum nfsstat3)error);
3675 }
3676 #else
3677 switch (error) {
3678 case ENAMETOOLONG:
3679 return (NFS3ERR_NAMETOOLONG);
3680 case ENOTEMPTY:
3681 return (NFS3ERR_NOTEMPTY);
3682 case EDQUOT:
3683 return (NFS3ERR_DQUOT);
3684 case ESTALE:
3685 return (NFS3ERR_STALE);
3686 case ENOSYS:
3687 case EOPNOTSUPP:
3688 return (NFS3ERR_NOTSUPP);
3689 case EREMOTE:
3690 return (NFS3ERR_REMOTE);
3691 case EOVERFLOW:
3692 return (NFS3ERR_INVAL);
3693 default:
3694 return ((enum nfsstat3)error);
3695 }
3696 #endif
3697 }
3698
3699 int
3700 geterrno3(enum nfsstat3 status)
3701 {
3702
3703 #ifdef DEBUG
3704 switch (status) {
3705 case NFS3_OK:
3706 return (0);
3707 case NFS3ERR_PERM:
3708 return (EPERM);
3709 case NFS3ERR_NOENT:
3710 return (ENOENT);
3711 case NFS3ERR_IO:
3712 return (EIO);
3713 case NFS3ERR_NXIO:
3714 return (ENXIO);
3715 case NFS3ERR_ACCES:
3716 return (EACCES);
3717 case NFS3ERR_EXIST:
3718 return (EEXIST);
3719 case NFS3ERR_XDEV:
3720 return (EXDEV);
3721 case NFS3ERR_NODEV:
3722 return (ENODEV);
3723 case NFS3ERR_NOTDIR:
3724 return (ENOTDIR);
3725 case NFS3ERR_ISDIR:
3726 return (EISDIR);
3727 case NFS3ERR_INVAL:
3728 return (EINVAL);
3729 case NFS3ERR_FBIG:
3730 return (EFBIG);
3731 case NFS3ERR_NOSPC:
3732 return (ENOSPC);
3733 case NFS3ERR_ROFS:
3734 return (EROFS);
3735 case NFS3ERR_MLINK:
3736 return (EMLINK);
3737 case NFS3ERR_NAMETOOLONG:
3738 return (ENAMETOOLONG);
3739 case NFS3ERR_NOTEMPTY:
3740 return (ENOTEMPTY);
3741 case NFS3ERR_DQUOT:
3742 return (EDQUOT);
3743 case NFS3ERR_STALE:
3744 return (ESTALE);
3745 case NFS3ERR_REMOTE:
3746 return (EREMOTE);
3747 case NFS3ERR_BADHANDLE:
3748 return (ESTALE);
3749 case NFS3ERR_NOT_SYNC:
3750 return (EINVAL);
3751 case NFS3ERR_BAD_COOKIE:
3752 return (ENOENT);
3753 case NFS3ERR_NOTSUPP:
3754 return (EOPNOTSUPP);
3755 case NFS3ERR_TOOSMALL:
3756 return (EINVAL);
3757 case NFS3ERR_SERVERFAULT:
3758 return (EIO);
3759 case NFS3ERR_BADTYPE:
3760 return (EINVAL);
3761 case NFS3ERR_JUKEBOX:
3762 return (ENXIO);
3763 default:
3764 zcmn_err(getzoneid(), CE_WARN,
3765 "geterrno3: got status %d", status);
3766 return ((int)status);
3767 }
3768 #else
3769 switch (status) {
3770 case NFS3ERR_NAMETOOLONG:
3771 return (ENAMETOOLONG);
3772 case NFS3ERR_NOTEMPTY:
3773 return (ENOTEMPTY);
3774 case NFS3ERR_DQUOT:
3775 return (EDQUOT);
3776 case NFS3ERR_STALE:
3777 case NFS3ERR_BADHANDLE:
3778 return (ESTALE);
3779 case NFS3ERR_NOTSUPP:
3780 return (EOPNOTSUPP);
3781 case NFS3ERR_REMOTE:
3782 return (EREMOTE);
3783 case NFS3ERR_NOT_SYNC:
3784 case NFS3ERR_TOOSMALL:
3785 case NFS3ERR_BADTYPE:
3786 return (EINVAL);
3787 case NFS3ERR_BAD_COOKIE:
3788 return (ENOENT);
3789 case NFS3ERR_SERVERFAULT:
3790 return (EIO);
3791 case NFS3ERR_JUKEBOX:
3792 return (ENXIO);
3793 default:
3794 return ((int)status);
3795 }
3796 #endif
3797 }
3798
3799 rddir_cache *
3800 rddir_cache_alloc(int flags)
3801 {
3802 rddir_cache *rc;
3803
3804 rc = kmem_alloc(sizeof (*rc), flags);
3805 if (rc != NULL) {
3806 rc->entries = NULL;
3807 rc->flags = RDDIR;
3808 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3809 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3810 rc->count = 1;
3811 #ifdef DEBUG
3812 atomic_add_64(&clstat_debug.dirent.value.ui64, 1);
3813 #endif
3814 }
3815 return (rc);
3816 }
3817
3818 static void
3819 rddir_cache_free(rddir_cache *rc)
3820 {
3821
3822 #ifdef DEBUG
3823 atomic_add_64(&clstat_debug.dirent.value.ui64, -1);
3824 #endif
3825 if (rc->entries != NULL) {
3826 #ifdef DEBUG
3827 rddir_cache_buf_free(rc->entries, rc->buflen);
3828 #else
3829 kmem_free(rc->entries, rc->buflen);
3830 #endif
3831 }
3832 cv_destroy(&rc->cv);
3833 mutex_destroy(&rc->lock);
3834 kmem_free(rc, sizeof (*rc));
3835 }
3836
3837 void
3838 rddir_cache_hold(rddir_cache *rc)
3839 {
3840
3841 mutex_enter(&rc->lock);
3842 rc->count++;
3843 mutex_exit(&rc->lock);
3844 }
3845
3846 void
3847 rddir_cache_rele(rddir_cache *rc)
3848 {
3849
3850 mutex_enter(&rc->lock);
3851 ASSERT(rc->count > 0);
3852 if (--rc->count == 0) {
3853 mutex_exit(&rc->lock);
3854 rddir_cache_free(rc);
3855 } else
3856 mutex_exit(&rc->lock);
3857 }
3858
3859 #ifdef DEBUG
3860 char *
3861 rddir_cache_buf_alloc(size_t size, int flags)
3862 {
3863 char *rc;
3864
3865 rc = kmem_alloc(size, flags);
3866 if (rc != NULL)
3867 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3868 return (rc);
3869 }
3870
3871 void
3872 rddir_cache_buf_free(void *addr, size_t size)
3873 {
3874
3875 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3876 kmem_free(addr, size);
3877 }
3878 #endif
3879
3880 static int
3881 nfs_free_data_reclaim(rnode_t *rp)
3882 {
3883 char *contents;
3884 int size;
3885 vsecattr_t *vsp;
3886 nfs3_pathconf_info *info;
3887 int freed;
3888 cred_t *cred;
3889
3890 /*
3891 * Free any held credentials and caches which
3892 * may be associated with this rnode.
3893 */
3894 mutex_enter(&rp->r_statelock);
3895 cred = rp->r_cred;
3896 rp->r_cred = NULL;
3897 contents = rp->r_symlink.contents;
3898 size = rp->r_symlink.size;
3899 rp->r_symlink.contents = NULL;
3900 vsp = rp->r_secattr;
3901 rp->r_secattr = NULL;
3902 info = rp->r_pathconf;
3903 rp->r_pathconf = NULL;
3904 mutex_exit(&rp->r_statelock);
3905
3906 if (cred != NULL)
3907 crfree(cred);
3908
3909 /*
3910 * Free the access cache entries.
3911 */
3912 freed = nfs_access_purge_rp(rp);
3913
3914 if (!HAVE_RDDIR_CACHE(rp) &&
3915 contents == NULL &&
3916 vsp == NULL &&
3917 info == NULL)
3918 return (freed);
3919
3920 /*
3921 * Free the readdir cache entries
3922 */
3923 if (HAVE_RDDIR_CACHE(rp))
3924 nfs_purge_rddir_cache(RTOV(rp));
3925
3926 /*
3927 * Free the symbolic link cache.
3928 */
3929 if (contents != NULL) {
3930
3931 kmem_free((void *)contents, size);
3932 }
3933
3934 /*
3935 * Free any cached ACL.
3936 */
3937 if (vsp != NULL)
3938 nfs_acl_free(vsp);
3939
3940 /*
3941 * Free any cached pathconf information.
3942 */
3943 if (info != NULL)
3944 kmem_free(info, sizeof (*info));
3945
3946 return (1);
3947 }
3948
3949 static int
3950 nfs_active_data_reclaim(rnode_t *rp)
3951 {
3952 char *contents;
3953 int size;
3954 vsecattr_t *vsp;
3955 nfs3_pathconf_info *info;
3956 int freed;
3957
3958 /*
3959 * Free any held credentials and caches which
3960 * may be associated with this rnode.
3961 */
3962 if (!mutex_tryenter(&rp->r_statelock))
3963 return (0);
3964 contents = rp->r_symlink.contents;
3965 size = rp->r_symlink.size;
3966 rp->r_symlink.contents = NULL;
3967 vsp = rp->r_secattr;
3968 rp->r_secattr = NULL;
3969 info = rp->r_pathconf;
3970 rp->r_pathconf = NULL;
3971 mutex_exit(&rp->r_statelock);
3972
3973 /*
3974 * Free the access cache entries.
3975 */
3976 freed = nfs_access_purge_rp(rp);
3977
3978 if (!HAVE_RDDIR_CACHE(rp) &&
3979 contents == NULL &&
3980 vsp == NULL &&
3981 info == NULL)
3982 return (freed);
3983
3984 /*
3985 * Free the readdir cache entries
3986 */
3987 if (HAVE_RDDIR_CACHE(rp))
3988 nfs_purge_rddir_cache(RTOV(rp));
3989
3990 /*
3991 * Free the symbolic link cache.
3992 */
3993 if (contents != NULL) {
3994
3995 kmem_free((void *)contents, size);
3996 }
3997
3998 /*
3999 * Free any cached ACL.
4000 */
4001 if (vsp != NULL)
4002 nfs_acl_free(vsp);
4003
4004 /*
4005 * Free any cached pathconf information.
4006 */
4007 if (info != NULL)
4008 kmem_free(info, sizeof (*info));
4009
4010 return (1);
4011 }
4012
4013 static int
4014 nfs_free_reclaim(void)
4015 {
4016 int freed;
4017 rnode_t *rp;
4018
4019 #ifdef DEBUG
4020 clstat_debug.f_reclaim.value.ui64++;
4021 #endif
4022 freed = 0;
4023 mutex_enter(&rpfreelist_lock);
4024 rp = rpfreelist;
4025 if (rp != NULL) {
4026 do {
4027 if (nfs_free_data_reclaim(rp))
4028 freed = 1;
4029 } while ((rp = rp->r_freef) != rpfreelist);
4030 }
4031 mutex_exit(&rpfreelist_lock);
4032 return (freed);
4033 }
4034
4035 static int
4036 nfs_active_reclaim(void)
4037 {
4038 int freed;
4039 int index;
4040 rnode_t *rp;
4041
4042 #ifdef DEBUG
4043 clstat_debug.a_reclaim.value.ui64++;
4044 #endif
4045 freed = 0;
4046 for (index = 0; index < rtablesize; index++) {
4047 rw_enter(&rtable[index].r_lock, RW_READER);
4048 for (rp = rtable[index].r_hashf;
4049 rp != (rnode_t *)(&rtable[index]);
4050 rp = rp->r_hashf) {
4051 if (nfs_active_data_reclaim(rp))
4052 freed = 1;
4053 }
4054 rw_exit(&rtable[index].r_lock);
4055 }
4056 return (freed);
4057 }
4058
4059 static int
4060 nfs_rnode_reclaim(void)
4061 {
4062 int freed;
4063 rnode_t *rp;
4064 vnode_t *vp;
4065
4066 #ifdef DEBUG
4067 clstat_debug.r_reclaim.value.ui64++;
4068 #endif
4069 freed = 0;
4070 mutex_enter(&rpfreelist_lock);
4071 while ((rp = rpfreelist) != NULL) {
4072 rp_rmfree(rp);
4073 mutex_exit(&rpfreelist_lock);
4074 if (rp->r_flags & RHASHED) {
4075 vp = RTOV(rp);
4076 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4077 mutex_enter(&vp->v_lock);
4078 if (vp->v_count > 1) {
4079 vp->v_count--;
4080 mutex_exit(&vp->v_lock);
4081 rw_exit(&rp->r_hashq->r_lock);
4082 mutex_enter(&rpfreelist_lock);
4083 continue;
4084 }
4085 mutex_exit(&vp->v_lock);
4086 rp_rmhash_locked(rp);
4087 rw_exit(&rp->r_hashq->r_lock);
4088 }
4089 /*
4090 * This call to rp_addfree will end up destroying the
4091 * rnode, but in a safe way with the appropriate set
4092 * of checks done.
4093 */
4094 rp_addfree(rp, CRED());
4095 mutex_enter(&rpfreelist_lock);
4096 }
4097 mutex_exit(&rpfreelist_lock);
4098 return (freed);
4099 }
4100
4101 /*ARGSUSED*/
4102 static void
4103 nfs_reclaim(void *cdrarg)
4104 {
4105
4106 #ifdef DEBUG
4107 clstat_debug.reclaim.value.ui64++;
4108 #endif
4109 if (nfs_free_reclaim())
4110 return;
4111
4112 if (nfs_active_reclaim())
4113 return;
4114
4115 (void) nfs_rnode_reclaim();
4116 }
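
/*
 * Illustrative sketch (assumption, not verbatim from this file): a reclaim
 * callback such as nfs_reclaim() is normally registered with the kernel
 * memory allocator when the rnode cache is created, so that it is invoked
 * under memory pressure, roughly:
 *
 *	rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
 *	    0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
 */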
4117
4118 /*
4119 * NFS client failover support
4120 *
4121 * Routines to copy filehandles
4122 */
4123 void
4124 nfscopyfh(caddr_t fhp, vnode_t *vp)
4125 {
4126 fhandle_t *dest = (fhandle_t *)fhp;
4127
4128 if (dest != NULL)
4129 *dest = *VTOFH(vp);
4130 }
4131
4132 void
4133 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4134 {
4135 nfs_fh3 *dest = (nfs_fh3 *)fhp;
4136
4137 if (dest != NULL)
4138 *dest = *VTOFH3(vp);
4139 }
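
/*
 * Illustrative sketch (assumed caller; the lookup and xattr routine names
 * are hypothetical): an over-the-wire operation fills in a failinfo_t so
 * that failover_remap() can later refresh the filehandle it copied into
 * the RPC arguments, roughly:
 *
 *	failinfo_t fi;
 *
 *	fi.vp = vp;
 *	fi.fhp = (caddr_t)&args.file;
 *	fi.copyproc = nfs3copyfh;
 *	fi.lookupproc = nfs3lookup;
 *	fi.xattrdirproc = acl_getxattrdir3;
 */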
4140
4141 /*
4142 * NFS client failover support
4143 *
4144 * failover_safe() will test various conditions to ensure that
4145 * failover is permitted for this vnode. It will be denied
4146 * if:
4147 * 1) the operation in progress does not support failover (NULL fi)
4148 * 2) there are no available replicas (NULL mi_servers->sv_next)
4149 * 3) any locks are outstanding on this file
4150 */
4151 static int
4152 failover_safe(failinfo_t *fi)
4153 {
4154
4155 /*
4156 * Does this op permit failover?
4157 */
4158 if (fi == NULL || fi->vp == NULL)
4159 return (0);
4160
4161 /*
4162 * Are there any alternates to failover to?
4163 */
4164 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4165 return (0);
4166
4167 /*
4168 * Disable check; we've forced local locking
4169 *
4170 * if (flk_has_remote_locks(fi->vp))
4171 * return (0);
4172 */
4173
4174 /*
4175 * If we have no partial path, we can't do anything
4176 */
4177 if (VTOR(fi->vp)->r_path == NULL)
4178 return (0);
4179
4180 return (1);
4181 }
4182
4183 #include <sys/thread.h>
4184
4185 /*
4186 * NFS client failover support
4187 *
4188 * failover_newserver() will start a search for a new server,
4189 * preferably by starting an async thread to do the work. If
4190 * someone is already doing this (recognizable by MI_BINDINPROG
4191 * being set), it will simply return and the calling thread
4192 * will queue on the mi_failover_cv condition variable.
4193 */
4194 static void
4195 failover_newserver(mntinfo_t *mi)
4196 {
4197 /*
4198 * Check if someone else is doing this already
4199 */
4200 mutex_enter(&mi->mi_lock);
4201 if (mi->mi_flags & MI_BINDINPROG) {
4202 mutex_exit(&mi->mi_lock);
4203 return;
4204 }
4205 mi->mi_flags |= MI_BINDINPROG;
4206
4207 /*
4208 * Need to hold the vfs struct so that it can't be released
4209 * while the failover thread is selecting a new server.
4210 */
4211 VFS_HOLD(mi->mi_vfsp);
4212
4213 /*
4214 * Start a thread to do the real searching.
4215 */
4216 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4217
4218 mutex_exit(&mi->mi_lock);
4219 }
4220
4221 /*
4222 * NFS client failover support
4223 *
4224 * failover_thread() will find a new server to replace the one
4225 * currently in use, wake up other threads waiting on this mount
4226 * point, and die. It will start at the head of the server list
4227 * and poll servers until it finds one with an NFS server which is
4228 * registered and responds to a NULL procedure ping.
4229 *
4230 * XXX failover_thread is unsafe within the scope of the
4231 * present model defined for cpr to suspend the system.
4232 * Specifically, over-the-wire calls made by the thread
4233 * are unsafe. The thread needs to be reevaluated in case of
4234 * future updates to the cpr suspend model.
4235 */
4236 static void
4237 failover_thread(mntinfo_t *mi)
4238 {
4239 servinfo_t *svp = NULL;
4240 CLIENT *cl;
4241 enum clnt_stat status;
4242 struct timeval tv;
4243 int error;
4244 int oncethru = 0;
4245 callb_cpr_t cprinfo;
4246 rnode_t *rp;
4247 int index;
4248 char *srvnames;
4249 size_t srvnames_len;
4250 struct nfs_clnt *nfscl = NULL;
4251 zoneid_t zoneid = getzoneid();
4252
4253 #ifdef DEBUG
4254 /*
4255 * This is currently only needed to access counters which exist on
4256 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4257 * on non-DEBUG kernels.
4258 */
4259 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4260 ASSERT(nfscl != NULL);
4261 #endif
4262
4263 /*
4264 * It's safe to piggyback on the mi_lock since failover_newserver()
4265 * code guarantees that there will be only one failover thread
4266 * per mountinfo at any instance.
4267 */
4268 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4269 "failover_thread");
4270
4271 mutex_enter(&mi->mi_lock);
4272 while (mi->mi_readers) {
4273 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4274 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4275 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4276 }
4277 mutex_exit(&mi->mi_lock);
4278
4279 tv.tv_sec = 2;
4280 tv.tv_usec = 0;
4281
4282 /*
4283 * Ping the null NFS procedure of every server in
4284 * the list until one responds. We always start
4285 * at the head of the list and always skip the one
4286 * that is current, since it's caused us a problem.
4287 */
4288 while (svp == NULL) {
4289 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4290 if (!oncethru && svp == mi->mi_curr_serv)
4291 continue;
4292
4293 /*
4294 * If the file system was forcibly umounted
4295 * while trying to do a failover, then just
4296 * give up on the failover. It won't matter
4297 * what the server is.
4298 */
4299 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4300 svp = NULL;
4301 goto done;
4302 }
4303
4304 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4305 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4306 if (error)
4307 continue;
4308
4309 if (!(mi->mi_flags & MI_INT))
4310 cl->cl_nosignal = TRUE;
4311 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4312 xdr_void, NULL, tv);
4313 if (!(mi->mi_flags & MI_INT))
4314 cl->cl_nosignal = FALSE;
4315 AUTH_DESTROY(cl->cl_auth);
4316 CLNT_DESTROY(cl);
4317 if (status == RPC_SUCCESS) {
4318 if (svp == mi->mi_curr_serv) {
4319 #ifdef DEBUG
4320 zcmn_err(zoneid, CE_NOTE,
4321 "NFS%d: failing over: selecting original server %s",
4322 mi->mi_vers, svp->sv_hostname);
4323 #else
4324 zcmn_err(zoneid, CE_NOTE,
4325 "NFS: failing over: selecting original server %s",
4326 svp->sv_hostname);
4327 #endif
4328 } else {
4329 #ifdef DEBUG
4330 zcmn_err(zoneid, CE_NOTE,
4331 "NFS%d: failing over from %s to %s",
4332 mi->mi_vers,
4333 mi->mi_curr_serv->sv_hostname,
4334 svp->sv_hostname);
4335 #else
4336 zcmn_err(zoneid, CE_NOTE,
4337 "NFS: failing over from %s to %s",
4338 mi->mi_curr_serv->sv_hostname,
4339 svp->sv_hostname);
4340 #endif
4341 }
4342 break;
4343 }
4344 }
4345
4346 if (svp == NULL) {
4347 if (!oncethru) {
4348 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4349 #ifdef DEBUG
4350 zprintf(zoneid,
4351 "NFS%d servers %s not responding "
4352 "still trying\n", mi->mi_vers, srvnames);
4353 #else
4354 zprintf(zoneid, "NFS servers %s not responding "
4355 "still trying\n", srvnames);
4356 #endif
4357 oncethru = 1;
4358 }
4359 mutex_enter(&mi->mi_lock);
4360 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4361 mutex_exit(&mi->mi_lock);
4362 delay(hz);
4363 mutex_enter(&mi->mi_lock);
4364 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4365 mutex_exit(&mi->mi_lock);
4366 }
4367 }
4368
4369 if (oncethru) {
4370 #ifdef DEBUG
4371 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4372 #else
4373 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4374 #endif
4375 }
4376
4377 if (svp != mi->mi_curr_serv) {
4378 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4379 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4380 rw_enter(&rtable[index].r_lock, RW_WRITER);
4381 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4382 mi->mi_vfsp);
4383 if (rp != NULL) {
4384 if (rp->r_flags & RHASHED)
4385 rp_rmhash_locked(rp);
4386 rw_exit(&rtable[index].r_lock);
4387 rp->r_server = svp;
4388 rp->r_fh = svp->sv_fhandle;
4389 (void) nfs_free_data_reclaim(rp);
4390 index = rtablehash(&rp->r_fh);
4391 rp->r_hashq = &rtable[index];
4392 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4393 vn_exists(RTOV(rp));
4394 rp_addhash(rp);
4395 rw_exit(&rp->r_hashq->r_lock);
4396 VN_RELE(RTOV(rp));
4397 } else
4398 rw_exit(&rtable[index].r_lock);
4399 }
4400
4401 done:
4402 if (oncethru)
4403 kmem_free(srvnames, srvnames_len);
4404 mutex_enter(&mi->mi_lock);
4405 mi->mi_flags &= ~MI_BINDINPROG;
4406 if (svp != NULL) {
4407 mi->mi_curr_serv = svp;
4408 mi->mi_failover++;
4409 #ifdef DEBUG
4410 nfscl->nfscl_stat.failover.value.ui64++;
4411 #endif
4412 }
4413 cv_broadcast(&mi->mi_failover_cv);
4414 CALLB_CPR_EXIT(&cprinfo);
4415 VFS_RELE(mi->mi_vfsp);
4416 zthread_exit();
4417 /* NOTREACHED */
4418 }
4419
4420 /*
4421 * NFS client failover support
4422 *
4423 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4424 * is cleared, meaning that failover is complete. Called with
4425 * mi_lock mutex held.
4426 */
4427 static int
4428 failover_wait(mntinfo_t *mi)
4429 {
4430 k_sigset_t smask;
4431
4432 /*
4433 * If someone else is hunting for a living server,
4434 * sleep until it's done. After our sleep, we may
4435 * be bound to the right server and get off cheaply.
4436 */
4437 while (mi->mi_flags & MI_BINDINPROG) {
4438 /*
4439 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4440 * and SIGTERM. (Preserving the existing masks).
4441 * Mask out SIGINT if mount option nointr is specified.
4442 */
4443 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4444 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4445 /*
4446 * restore original signal mask
4447 */
4448 sigunintr(&smask);
4449 return (EINTR);
4450 }
4451 /*
4452 * restore original signal mask
4453 */
4454 sigunintr(&smask);
4455 }
4456 return (0);
4457 }
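
/*
 * Illustrative sketch (assumed caller, not verbatim from this file): the
 * RPC retry path typically starts a search for a new server and then
 * blocks until the bind completes, roughly:
 *
 *	failover_newserver(mi);
 *	mutex_enter(&mi->mi_lock);
 *	error = failover_wait(mi);
 *	mutex_exit(&mi->mi_lock);
 *	if (error == EINTR)
 *		return (EINTR);
 */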
4458
4459 /*
4460 * NFS client failover support
4461 *
4462 * failover_remap() will do a partial pathname lookup and find the
4463 * desired vnode on the current server. The interim vnode will be
4464 * discarded after we pilfer the new filehandle.
4465 *
4466 * Side effects:
4467 * - This routine will also update the filehandle in the args structure
4468 * pointed to by the fi->fhp pointer if it is non-NULL.
4469 */
4470
4471 static int
4472 failover_remap(failinfo_t *fi)
4473 {
4474 vnode_t *vp, *nvp, *rootvp;
4475 rnode_t *rp, *nrp;
4476 mntinfo_t *mi;
4477 int error;
4478 #ifdef DEBUG
4479 struct nfs_clnt *nfscl;
4480
4481 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4482 ASSERT(nfscl != NULL);
4483 #endif
4484 /*
4485 * Sanity check
4486 */
4487 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4488 return (EINVAL);
4489 vp = fi->vp;
4490 rp = VTOR(vp);
4491 mi = VTOMI(vp);
4492
4493 if (!(vp->v_flag & VROOT)) {
4494 /*
4495 * Given the root fh, use the path stored in
4496 * the rnode to find the fh for the new server.
4497 */
4498 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4499 if (error)
4500 return (error);
4501
4502 error = failover_lookup(rp->r_path, rootvp,
4503 fi->lookupproc, fi->xattrdirproc, &nvp);
4504
4505 VN_RELE(rootvp);
4506
4507 if (error)
4508 return (error);
4509
4510 /*
4511 * If we found the same rnode, we're done now
4512 */
4513 if (nvp == vp) {
4514 /*
4515 * The new server may be physically the same machine or
4516 * may share the same disk subsystem as the old one. In
4517 * that case the file handle for a given file path does
4518 * not change, so a lookup of the same filehandle will
4519 * locate the same rnode as the existing one. All we
4520 * might need to do is update r_server with the current
4521 * servinfo.
4522 */
4523 if (!VALID_FH(fi)) {
4524 rp->r_server = mi->mi_curr_serv;
4525 }
4526 VN_RELE(nvp);
4527 return (0);
4528 }
4529
4530 /*
4531 * Try to make it so that no one else will find this
4532 * vnode because it is just a temporary to hold the
4533 * new file handle until that file handle can be
4534 * copied to the original vnode/rnode.
4535 */
4536 nrp = VTOR(nvp);
4537 mutex_enter(&mi->mi_remap_lock);
4538 /*
4539 * Some other thread could have raced in here and could
4540 * have done the remap for this particular rnode before
4541 * this thread here. Check for rp->r_server and
4542 * mi->mi_curr_serv and return if they are same.
4543 */
4544 if (VALID_FH(fi)) {
4545 mutex_exit(&mi->mi_remap_lock);
4546 VN_RELE(nvp);
4547 return (0);
4548 }
4549
4550 if (nrp->r_flags & RHASHED)
4551 rp_rmhash(nrp);
4552
4553 /*
4554 * As a heuristic check on the validity of the new
4555 * file, check that the size and type match those
4556 * we remember from the old version.
4557 */
4558 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4559 mutex_exit(&mi->mi_remap_lock);
4560 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4561 "NFS replicas %s and %s: file %s not same.",
4562 rp->r_server->sv_hostname,
4563 nrp->r_server->sv_hostname, rp->r_path);
4564 VN_RELE(nvp);
4565 return (EINVAL);
4566 }
4567
4568 /*
4569 * snarf the filehandle from the new rnode
4570 * then release it, again while updating the
4571 * hash queues for the rnode.
4572 */
4573 if (rp->r_flags & RHASHED)
4574 rp_rmhash(rp);
4575 rp->r_server = mi->mi_curr_serv;
4576 rp->r_fh = nrp->r_fh;
4577 rp->r_hashq = nrp->r_hashq;
4578 /*
4579 * Copy the attributes from the new rnode to the old
4580 * rnode. This will help to reduce unnecessary page
4581 * cache flushes.
4582 */
4583 rp->r_attr = nrp->r_attr;
4584 rp->r_attrtime = nrp->r_attrtime;
4585 rp->r_mtime = nrp->r_mtime;
4586 (void) nfs_free_data_reclaim(rp);
4587 nfs_setswaplike(vp, &rp->r_attr);
4588 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4589 rp_addhash(rp);
4590 rw_exit(&rp->r_hashq->r_lock);
4591 mutex_exit(&mi->mi_remap_lock);
4592 VN_RELE(nvp);
4593 }
4594
4595 /*
4596 * Update successful failover remap count
4597 */
4598 mutex_enter(&mi->mi_lock);
4599 mi->mi_remap++;
4600 mutex_exit(&mi->mi_lock);
4601 #ifdef DEBUG
4602 nfscl->nfscl_stat.remap.value.ui64++;
4603 #endif
4604
4605 /*
4606 * If we have a copied filehandle to update, do it now.
4607 */
4608 if (fi->fhp != NULL && fi->copyproc != NULL)
4609 (*fi->copyproc)(fi->fhp, vp);
4610
4611 return (0);
4612 }
4613
4614 /*
4615 * NFS client failover support
4616 *
4617 * We want a simple pathname lookup routine to parse the pieces
4618 * of path in rp->r_path. We know that the path was created
4619 * as rnodes were made, so we know we have only to deal with
4620 * paths that look like:
4621 * dir1/dir2/dir3/file
4622 * Any evidence of anything like .., symlinks, or ENOTDIR
4623 * is a hard error, because it means something in this filesystem
4624 * is different from the one we came from, or has changed under
4625 * us in some way. If this is true, we want the failure.
4626 *
4627 * Extended attributes: if the filesystem is mounted with extended
4628 * attributes enabled (-o xattr), the attribute directory will be
4629 * represented in the r_path as the magic name XATTR_RPATH. So if
4630 * we see that name in the pathname, it must be because this node
4631 * is an extended attribute. Therefore, look it up that way.
4632 */
4633 static int
4634 failover_lookup(char *path, vnode_t *root,
4635 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4636 vnode_t *, cred_t *, int),
4637 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4638 vnode_t **new)
4639 {
4640 vnode_t *dvp, *nvp;
4641 int error = EINVAL;
4642 char *s, *p, *tmppath;
4643 size_t len;
4644 mntinfo_t *mi;
4645 bool_t xattr;
4646
4647 /* Make local copy of path */
4648 len = strlen(path) + 1;
4649 tmppath = kmem_alloc(len, KM_SLEEP);
4650 (void) strcpy(tmppath, path);
4651 s = tmppath;
4652
4653 dvp = root;
4654 VN_HOLD(dvp);
4655 mi = VTOMI(root);
4656 xattr = mi->mi_flags & MI_EXTATTR;
4657
4658 do {
4659 p = strchr(s, '/');
4660 if (p != NULL)
4661 *p = '\0';
4662 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4663 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4664 RFSCALL_SOFT);
4665 } else {
4666 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4667 CRED(), RFSCALL_SOFT);
4668 }
4669 if (p != NULL)
4670 *p++ = '/';
4671 if (error) {
4672 VN_RELE(dvp);
4673 kmem_free(tmppath, len);
4674 return (error);
4675 }
4676 s = p;
4677 VN_RELE(dvp);
4678 dvp = nvp;
4679 } while (p != NULL);
4680
4681 if (nvp != NULL && new != NULL)
4682 *new = nvp;
4683 kmem_free(tmppath, len);
4684 return (0);
4685 }
4686
4687 /*
4688 * NFS client failover support
4689 *
4690 * sv_free() frees the malloc'd portion of a "servinfo_t".
4691 */
4692 void
4693 sv_free(servinfo_t *svp)
4694 {
4695 servinfo_t *next;
4696 struct knetconfig *knconf;
4697
4698 while (svp != NULL) {
4699 next = svp->sv_next;
4700 if (svp->sv_secdata)
4701 sec_clnt_freeinfo(svp->sv_secdata);
4702 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4703 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4704 knconf = svp->sv_knconf;
4705 if (knconf != NULL) {
4706 if (knconf->knc_protofmly != NULL)
4707 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4708 if (knconf->knc_proto != NULL)
4709 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4710 kmem_free(knconf, sizeof (*knconf));
4711 }
4712 knconf = svp->sv_origknconf;
4713 if (knconf != NULL) {
4714 if (knconf->knc_protofmly != NULL)
4715 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4716 if (knconf->knc_proto != NULL)
4717 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4718 kmem_free(knconf, sizeof (*knconf));
4719 }
4720 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4721 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4722 mutex_destroy(&svp->sv_lock);
4723 kmem_free(svp, sizeof (*svp));
4724 svp = next;
4725 }
4726 }
4727
4728 /*
4729 * Only can return non-zero if intr != 0.
4730 */
4731 int
4732 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4733 {
4734
4735 mutex_enter(&l->lock);
4736
4737 /*
4738 * If this is a nested enter, then allow it. There
4739 * must be as many exits as enters through.
4740 */
4741 if (l->owner == curthread) {
4742 /* lock is held for writing by current thread */
4743 ASSERT(rw == RW_READER || rw == RW_WRITER);
4744 l->count--;
4745 } else if (rw == RW_READER) {
4746 /*
4747 * While there is a writer active or writers waiting,
4748 * then wait for them to finish up and move on. Then,
4749 * increment the count to indicate that a reader is
4750 * active.
4751 */
4752 while (l->count < 0 || l->waiters > 0) {
4753 if (intr) {
4754 klwp_t *lwp = ttolwp(curthread);
4755
4756 if (lwp != NULL)
4757 lwp->lwp_nostop++;
4758 if (!cv_wait_sig(&l->cv, &l->lock)) {
4759 if (lwp != NULL)
4760 lwp->lwp_nostop--;
4761 mutex_exit(&l->lock);
4762 return (EINTR);
4763 }
4764 if (lwp != NULL)
4765 lwp->lwp_nostop--;
4766 } else
4767 cv_wait(&l->cv, &l->lock);
4768 }
4769 ASSERT(l->count < INT_MAX);
4770 #ifdef DEBUG
4771 if ((l->count % 10000) == 9999)
4772 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4773 "rwlock @ %p\n", l->count, (void *)l);
4774 #endif
4775 l->count++;
4776 } else {
4777 ASSERT(rw == RW_WRITER);
4778 /*
4779 * While there are readers active or a writer
4780 * active, then wait for all of the readers
4781 * to finish or for the writer to finish.
4782 * Then, set the owner field to curthread and
4783 * decrement count to indicate that a writer
4784 * is active.
4785 */
4786 while (l->count > 0 || l->owner != NULL) {
4787 l->waiters++;
4788 if (intr) {
4789 klwp_t *lwp = ttolwp(curthread);
4790
4791 if (lwp != NULL)
4792 lwp->lwp_nostop++;
4793 if (!cv_wait_sig(&l->cv, &l->lock)) {
4794 if (lwp != NULL)
4795 lwp->lwp_nostop--;
4796 l->waiters--;
4797 cv_broadcast(&l->cv);
4798 mutex_exit(&l->lock);
4799 return (EINTR);
4800 }
4801 if (lwp != NULL)
4802 lwp->lwp_nostop--;
4803 } else
4804 cv_wait(&l->cv, &l->lock);
4805 l->waiters--;
4806 }
4807 l->owner = curthread;
4808 l->count--;
4809 }
4810
4811 mutex_exit(&l->lock);
4812
4813 return (0);
4814 }
4815
4816 /*
4817 * If the lock is available, obtain it and return non-zero. If there is
4818 * already a conflicting lock, return 0 immediately.
4819 */
4820
4821 int
4822 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4823 {
4824 mutex_enter(&l->lock);
4825
4826 /*
4827 * If this is a nested enter, then allow it. There
4828 * must be as many exits as enters through.
4829 */
4830 if (l->owner == curthread) {
4831 /* lock is held for writing by current thread */
4832 ASSERT(rw == RW_READER || rw == RW_WRITER);
4833 l->count--;
4834 } else if (rw == RW_READER) {
4835 /*
4836 * If there is a writer active or writers waiting, deny the
4837 * lock. Otherwise, bump the count of readers.
4838 */
4839 if (l->count < 0 || l->waiters > 0) {
4840 mutex_exit(&l->lock);
4841 return (0);
4842 }
4843 l->count++;
4844 } else {
4845 ASSERT(rw == RW_WRITER);
4846 /*
4847 * If there are readers active or a writer active, deny the
4848 * lock. Otherwise, set the owner field to curthread and
4849 * decrement count to indicate that a writer is active.
4850 */
4851 if (l->count > 0 || l->owner != NULL) {
4852 mutex_exit(&l->lock);
4853 return (0);
4854 }
4855 l->owner = curthread;
4856 l->count--;
4857 }
4858
4859 mutex_exit(&l->lock);
4860
4861 return (1);
4862 }
4863
4864 void
4865 nfs_rw_exit(nfs_rwlock_t *l)
4866 {
4867
4868 mutex_enter(&l->lock);
4869 /*
4870 * If this is releasing a writer lock, then increment count to
4871 * indicate that there is one less writer active. If this was
4872 * the last of possibly nested writer locks, then clear the owner
4873 * field as well to indicate that there is no writer active
4874 * and wake up any waiting writers or readers.
4875 *
4876 * If releasing a reader lock, then just decrement count to
4877 * indicate that there is one less reader active. If this was
4878 * the last active reader and there are writer(s) waiting,
4879 * then wake up the first.
4880 */
4881 if (l->owner != NULL) {
4882 ASSERT(l->owner == curthread);
4883 l->count++;
4884 if (l->count == 0) {
4885 l->owner = NULL;
4886 cv_broadcast(&l->cv);
4887 }
4888 } else {
4889 ASSERT(l->count > 0);
4890 l->count--;
4891 if (l->count == 0 && l->waiters > 0)
4892 cv_broadcast(&l->cv);
4893 }
4894 mutex_exit(&l->lock);
4895 }
4896
4897 int
4898 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4899 {
4900
4901 if (rw == RW_READER)
4902 return (l->count > 0);
4903 ASSERT(rw == RW_WRITER);
4904 return (l->count < 0);
4905 }
4906
4907 /* ARGSUSED */
4908 void
4909 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4910 {
4911
4912 l->count = 0;
4913 l->waiters = 0;
4914 l->owner = NULL;
4915 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4916 cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4917 }
4918
4919 void
4920 nfs_rw_destroy(nfs_rwlock_t *l)
4921 {
4922
4923 mutex_destroy(&l->lock);
4924 cv_destroy(&l->cv);
4925 }
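
/*
 * Illustrative sketch (not verbatim from this file): the nfs_rwlock_t
 * primitive above follows the usual init/enter/exit/destroy lifecycle;
 * the third argument to nfs_rw_enter_sig() selects whether the wait may
 * be interrupted by a signal:
 *
 *	nfs_rwlock_t lk;
 *
 *	nfs_rw_init(&lk, NULL, RW_DEFAULT, NULL);
 *	if (nfs_rw_enter_sig(&lk, RW_WRITER, 1) == 0) {
 *		ASSERT(nfs_rw_lock_held(&lk, RW_WRITER));
 *		(update the protected state here)
 *		nfs_rw_exit(&lk);
 *	}
 *	nfs_rw_destroy(&lk);
 */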
4926
4927 int
4928 nfs3_rddir_compar(const void *x, const void *y)
4929 {
4930 rddir_cache *a = (rddir_cache *)x;
4931 rddir_cache *b = (rddir_cache *)y;
4932
4933 if (a->nfs3_cookie == b->nfs3_cookie) {
4934 if (a->buflen == b->buflen)
4935 return (0);
4936 if (a->buflen < b->buflen)
4937 return (-1);
4938 return (1);
4939 }
4940
4941 if (a->nfs3_cookie < b->nfs3_cookie)
4942 return (-1);
4943
4944 return (1);
4945 }
4946
4947 int
4948 nfs_rddir_compar(const void *x, const void *y)
4949 {
4950 rddir_cache *a = (rddir_cache *)x;
4951 rddir_cache *b = (rddir_cache *)y;
4952
4953 if (a->nfs_cookie == b->nfs_cookie) {
4954 if (a->buflen == b->buflen)
4955 return (0);
4956 if (a->buflen < b->buflen)
4957 return (-1);
4958 return (1);
4959 }
4960
4961 if (a->nfs_cookie < b->nfs_cookie)
4962 return (-1);
4963
4964 return (1);
4965 }
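
/*
 * Illustrative sketch (assumed usage, not verbatim from this file): these
 * comparators order a per-rnode readdir cache kept in an AVL tree, keyed
 * on the directory cookie and then on buffer length, roughly:
 *
 *	avl_create(&rp->r_dir, nfs_rddir_compar, sizeof (rddir_cache),
 *	    offsetof(rddir_cache, tree));
 */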
4966
4967 static char *
4968 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
4969 {
4970 servinfo_t *s;
4971 char *srvnames;
4972 char *namep;
4973 size_t length;
4974
4975 /*
4976 * Calculate the length of the string required to hold all
4977 * of the server names plus either a comma or a null
4978 * character following each individual one.
4979 */
4980 length = 0;
4981 for (s = mi->mi_servers; s != NULL; s = s->sv_next)
4982 length += s->sv_hostnamelen;
4983
4984 srvnames = kmem_alloc(length, KM_SLEEP);
4985
4986 namep = srvnames;
4987 for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
4988 (void) strcpy(namep, s->sv_hostname);
4989 namep += s->sv_hostnamelen - 1;
4990 *namep++ = ',';
4991 }
4992 *--namep = '\0';
4993
4994 *len = length;
4995
4996 return (srvnames);
4997 }
4998
4999 /*
5000 * These two functions are temporary and designed for the upgrade-workaround
5001 * only. They cannot be used for general zone-crossing NFS client support, and
5002 * will be removed shortly.
5003 *
5004 * When the workaround is enabled, all NFS traffic is forced into the global
5005 * zone. These functions are called when the code needs to refer to the state
5006 * of the underlying network connection. They're not called when the function
5007 * needs to refer to the state of the process that invoked the system call.
5008 * (E.g., when checking whether the zone is shutting down during the mount()
5009 * call.)
5010 */
5011
5012 struct zone *
5013 nfs_zone(void)
5014 {
5015 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5016 }
5017
5018 zoneid_t
5019 nfs_zoneid(void)
5020 {
5021 return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5022 }
5023
5024 /*
5025 * nfs_mount_label_policy:
5026 * Determine whether the mount is allowed according to MAC check,
5027 * by comparing (where appropriate) label of the remote server
5028 * against the label of the zone being mounted into.
5029 *
5030 * Returns:
5031 * 0 : access allowed
5032 * -1 : read-only access allowed (i.e., read-down)
5033 * >0 : error code, such as EACCES
5034 */
5035 int
5036 nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr,
5037 struct knetconfig *knconf, cred_t *cr)
5038 {
5039 int addr_type;
5040 void *ipaddr;
5041 bslabel_t *server_sl, *mntlabel;
5042 zone_t *mntzone = NULL;
5043 ts_label_t *zlabel;
5044 tsol_tpc_t *tp;
5045 ts_label_t *tsl = NULL;
5046 int retv;
5047
5048 /*
5049 * Get the zone's label. Each zone on a labeled system has a label.
5050 */
5051 mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
5052 zlabel = mntzone->zone_slabel;
5053 ASSERT(zlabel != NULL);
5054 label_hold(zlabel);
5055
5056 if (strcmp(knconf->knc_protofmly, NC_INET) == 0) {
5057 addr_type = IPV4_VERSION;
5058 ipaddr = &((struct sockaddr_in *)addr->buf)->sin_addr;
5059 } else if (strcmp(knconf->knc_protofmly, NC_INET6) == 0) {
5060 addr_type = IPV6_VERSION;
5061 ipaddr = &((struct sockaddr_in6 *)addr->buf)->sin6_addr;
5062 } else {
5063 retv = 0;
5064 goto out;
5065 }
5066
5067 retv = EACCES; /* assume the worst */
5068
5069 /*
5070 * Next, get the assigned label of the remote server.
5071 */
5072 tp = find_tpc(ipaddr, addr_type, B_FALSE);
5073 if (tp == NULL)
5074 goto out; /* error getting host entry */
5075
5076 if (tp->tpc_tp.tp_doi != zlabel->tsl_doi)
5077 goto rel_tpc; /* invalid domain */
5078 if ((tp->tpc_tp.host_type != SUN_CIPSO) &&
5079 (tp->tpc_tp.host_type != UNLABELED))
5080 goto rel_tpc; /* invalid hosttype */
5081
5082 if (tp->tpc_tp.host_type == SUN_CIPSO) {
5083 tsl = getflabel_cipso(vfsp);
5084 if (tsl == NULL)
5085 goto rel_tpc; /* error getting server lbl */
5086
5087 server_sl = label2bslabel(tsl);
5088 } else { /* UNLABELED */
5089 server_sl = &tp->tpc_tp.tp_def_label;
5090 }
5091
5092 mntlabel = label2bslabel(zlabel);
5093
5094 /*
5095 * Now compare labels to complete the MAC check. If the labels
5096 * are equal or if the requestor is in the global zone and has
5097 * NET_MAC_AWARE, then allow read-write access. (Except for
5098 * mounts into the global zone itself; restrict these to
5099 * read-only.)
5100 *
5101 * If the requestor is in some other zone, but his label
5102 * dominates the server, then allow read-down.
5103 *
5104 * Otherwise, access is denied.
5105 */
5106 if (blequal(mntlabel, server_sl) ||
5107 (crgetzoneid(cr) == GLOBAL_ZONEID &&
5108 getpflags(NET_MAC_AWARE, cr) != 0)) {
5109 if ((mntzone == global_zone) ||
5110 !blequal(mntlabel, server_sl))
5111 retv = -1; /* read-only */
5112 else
5113 retv = 0; /* access OK */
5114 } else if (bldominates(mntlabel, server_sl)) {
5115 retv = -1; /* read-only */
5116 } else {
5117 retv = EACCES;
5118 }
5119
5120 if (tsl != NULL)
5121 label_rele(tsl);
5122
5123 rel_tpc:
5124 TPC_RELE(tp);
5125 out:
5126 if (mntzone)
5127 zone_rele(mntzone);
5128 label_rele(zlabel);
5129 return (retv);
5130 }
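
/*
 * Illustrative sketch (assumed caller, not verbatim from this file): the
 * mount path maps the three-way result onto read-write access, a forced
 * read-only mount (read-down), or failure, roughly:
 *
 *	if (is_system_labeled()) {
 *		error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
 *		    svp->sv_knconf, cr);
 *		if (error > 0)
 *			goto errout;
 *		if (error == -1)
 *			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 *	}
 */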
5131
5132 boolean_t
5133 nfs_has_ctty(void)
5134 {
5135 boolean_t rv;
5136 mutex_enter(&curproc->p_splock);
5137 rv = (curproc->p_sessp->s_vp != NULL);
5138 mutex_exit(&curproc->p_splock);
5139 return (rv);
5140 }
5141
5142 /*
5143 * Look in the xattr directory to see whether it has any generic user attributes.
5144 */
5145 int
5146 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5147 {
5148 struct uio uio;
5149 struct iovec iov;
5150 char *dbuf;
5151 struct dirent64 *dp;
5152 size_t dlen = 8 * 1024;
5153 size_t dbuflen;
5154 int eof = 0;
5155 int error;
5156
5157 *valp = 0;
5158 dbuf = kmem_alloc(dlen, KM_SLEEP);
5159 uio.uio_iov = &iov;
5160 uio.uio_iovcnt = 1;
5161 uio.uio_segflg = UIO_SYSSPACE;
5162 uio.uio_fmode = 0;
5163 uio.uio_extflg = UIO_COPY_CACHED;
5164 uio.uio_loffset = 0;
5165 uio.uio_resid = dlen;
5166 iov.iov_base = dbuf;
5167 iov.iov_len = dlen;
5168 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
5169 error = VOP_READDIR(vp, &uio, cr, &eof, NULL, 0);
5170 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
5171
5172 dbuflen = dlen - uio.uio_resid;
5173
5174 if (error || dbuflen == 0) {
5175 kmem_free(dbuf, dlen);
5176 return (error);
5177 }
5178
5179 dp = (dirent64_t *)dbuf;
5180
5181 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5182 if (strcmp(dp->d_name, ".") == 0 ||
5183 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5184 VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5185 VIEW_READONLY) == 0) {
5186 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5187 continue;
5188 }
5189
5190 *valp = 1;
5191 break;
5192 }
5193 kmem_free(dbuf, dlen);
5194 return (0);
5195 }
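
/*
 * Illustrative sketch (assumed caller; the helper that locates the hidden
 * attribute directory is hypothetical): the pathconf code can use this to
 * answer a _PC_XATTR_EXISTS query, roughly:
 *
 *	case _PC_XATTR_EXISTS:
 *		*valp = 0;
 *		error = lookup_xattr_dir(vp, &avp, cr);
 *		if (!error && avp != NULL) {
 *			error = do_xattr_exists_check(avp, valp, cr);
 *			VN_RELE(avp);
 *		}
 *		break;
 */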
5196