1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 * All Rights Reserved
29 */
30
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cmn_err.h>
35 #include <sys/vtrace.h>
36 #include <sys/session.h>
37 #include <sys/thread.h>
38 #include <sys/dnlc.h>
39 #include <sys/cred.h>
40 #include <sys/priv.h>
41 #include <sys/list.h>
42 #include <sys/sdt.h>
43 #include <sys/policy.h>
44
45 #include <rpc/types.h>
46 #include <rpc/xdr.h>
47
48 #include <nfs/nfs.h>
49
50 #include <nfs/nfs_clnt.h>
51
52 #include <nfs/nfs4.h>
53 #include <nfs/rnode4.h>
54 #include <nfs/nfs4_clnt.h>
55
/*
 * Client side statistics.
 *
 * Template of named, 64-bit kstat counters for the NFSv4 client.  The first
 * six entries are always present; the last four exist only in DEBUG kernels.
 * The initializers are positional (no designators), so their order has to
 * match the member order of struct clstat4, which is declared outside this
 * file.
 */
static const struct clstat4 clstat4_tmpl = {
	{ "calls", KSTAT_DATA_UINT64 },
	{ "badcalls", KSTAT_DATA_UINT64 },
	{ "referrals", KSTAT_DATA_UINT64 },
	{ "referlinks", KSTAT_DATA_UINT64 },
	{ "clgets", KSTAT_DATA_UINT64 },
	{ "cltoomany", KSTAT_DATA_UINT64 },
#ifdef DEBUG
	{ "clalloc", KSTAT_DATA_UINT64 },
	{ "noresponse", KSTAT_DATA_UINT64 },
	{ "failover", KSTAT_DATA_UINT64 },
	{ "remap", KSTAT_DATA_UINT64 },
#endif
};
73
#ifdef DEBUG
/*
 * DEBUG-only named kstat counters.  Unlike clstat4_tmpl this object is
 * neither static nor const: it is updated in place, e.g. clreclaim4_zone()
 * increments the "clreclaim" counter on every invocation.  The initializers
 * are positional, so their order has to match struct clstat4_debug.
 */
struct clstat4_debug clstat4_debug = {
	{ "nrnode", KSTAT_DATA_UINT64 },
	{ "access", KSTAT_DATA_UINT64 },
	{ "dirent", KSTAT_DATA_UINT64 },
	{ "dirents", KSTAT_DATA_UINT64 },
	{ "reclaim", KSTAT_DATA_UINT64 },
	{ "clreclaim", KSTAT_DATA_UINT64 },
	{ "f_reclaim", KSTAT_DATA_UINT64 },
	{ "a_reclaim", KSTAT_DATA_UINT64 },
	{ "r_reclaim", KSTAT_DATA_UINT64 },
	{ "r_path", KSTAT_DATA_UINT64 },
};
#endif
88
/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.
 *
 * clreclaim4() walks nfs4_clnt_list and holds nfs4_clnt_list_lock for the
 * whole traversal.
 */
static list_t nfs4_clnt_list;
static kmutex_t nfs4_clnt_list_lock;
/*
 * NOTE(review): presumably the zone key under which the per-zone
 * struct nfs4_clnt is registered; it is not referenced in the portion of
 * the file reviewed here -- confirm against the zone init code.
 */
zone_key_t nfs4clnt_zone_key;

/*
 * kmem cache that struct chtab entries are allocated from and returned to
 * (see clget4() and clreclaim4_zone()).
 */
static struct kmem_cache *chtab4_cache;

#ifdef DEBUG
/*
 * DEBUG-only knobs.  nfs4_utf8_debug, when non-zero, makes utf8_to_fn() log
 * names it rejects for containing '/'.  The other two are not referenced in
 * the portion of the file reviewed here.
 */
static int nfs4_rfscall_debug;
static int nfs4_try_failover_any;
int nfs4_utf8_debug = 0;
#endif
104
/*
 * NFSv4 readdir cache implementation
 *
 * rddir4_cache_impl wraps the externally visible rddir4_cache element
 * together with the private bookkeeping kept for it: a reference count, the
 * mutex guarding that count, and the AVL linkage.
 */
typedef struct rddir4_cache_impl {
	rddir4_cache rc; /* readdir cache element */
	kmutex_t lock; /* lock protects count */
	uint_t count; /* reference count */
	avl_node_t tree; /* AVL tree link */
} rddir4_cache_impl;

/*
 * Forward declarations for file-local helpers; their definitions are not in
 * the portion of the file reviewed here.
 */
static int rddir4_cache_compar(const void *, const void *);
static void rddir4_cache_free(rddir4_cache_impl *);
static rddir4_cache *rddir4_cache_alloc(int);
static void rddir4_cache_hold(rddir4_cache *);
static int try_failover(enum clnt_stat);

/*
 * Readdir cache counters.
 * NOTE(review): the names suggest hit/wait/miss tallies; the code updating
 * them is outside the reviewed portion -- confirm.
 */
static int nfs4_readdir_cache_hits = 0;
static int nfs4_readdir_cache_waits = 0;
static int nfs4_readdir_cache_misses = 0;
124
125 /*
126 * Shared nfs4 functions
127 */
128
129 /*
130 * Copy an nfs_fh4. The destination storage (to->nfs_fh4_val) must already
131 * be allocated.
132 */
133
134 void
nfs_fh4_copy(nfs_fh4 * from,nfs_fh4 * to)135 nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
136 {
137 to->nfs_fh4_len = from->nfs_fh4_len;
138 bcopy(from->nfs_fh4_val, to->nfs_fh4_val, to->nfs_fh4_len);
139 }
140
141 /*
142 * nfs4cmpfh - compare 2 filehandles.
143 * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
144 * "less" than the second, +1 if the first is "greater" than the second.
145 */
146
147 int
nfs4cmpfh(const nfs_fh4 * fh4p1,const nfs_fh4 * fh4p2)148 nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
149 {
150 const char *c1, *c2;
151
152 if (fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len)
153 return (-1);
154 if (fh4p1->nfs_fh4_len > fh4p2->nfs_fh4_len)
155 return (1);
156 for (c1 = fh4p1->nfs_fh4_val, c2 = fh4p2->nfs_fh4_val;
157 c1 < fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;
158 c1++, c2++) {
159 if (*c1 < *c2)
160 return (-1);
161 if (*c1 > *c2)
162 return (1);
163 }
164
165 return (0);
166 }
167
168 /*
169 * Compare two v4 filehandles. Return zero if they're the same, non-zero
170 * if they're not. Like nfs4cmpfh(), but different filehandle
171 * representation, and doesn't provide information about greater than or
172 * less than.
173 */
174
175 int
nfs4cmpfhandle(nfs4_fhandle_t * fh1,nfs4_fhandle_t * fh2)176 nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2)
177 {
178 if (fh1->fh_len == fh2->fh_len)
179 return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len));
180
181 return (1);
182 }
183
184 int
stateid4_cmp(stateid4 * s1,stateid4 * s2)185 stateid4_cmp(stateid4 *s1, stateid4 *s2)
186 {
187 if (bcmp(s1, s2, sizeof (stateid4)) == 0)
188 return (1);
189 else
190 return (0);
191 }
192
193 nfsstat4
puterrno4(int error)194 puterrno4(int error)
195 {
196 switch (error) {
197 case 0:
198 return (NFS4_OK);
199 case EPERM:
200 return (NFS4ERR_PERM);
201 case ENOENT:
202 return (NFS4ERR_NOENT);
203 case EINTR:
204 return (NFS4ERR_IO);
205 case EIO:
206 return (NFS4ERR_IO);
207 case ENXIO:
208 return (NFS4ERR_NXIO);
209 case ENOMEM:
210 return (NFS4ERR_RESOURCE);
211 case EACCES:
212 return (NFS4ERR_ACCESS);
213 case EBUSY:
214 return (NFS4ERR_IO);
215 case EEXIST:
216 return (NFS4ERR_EXIST);
217 case EXDEV:
218 return (NFS4ERR_XDEV);
219 case ENODEV:
220 return (NFS4ERR_IO);
221 case ENOTDIR:
222 return (NFS4ERR_NOTDIR);
223 case EISDIR:
224 return (NFS4ERR_ISDIR);
225 case EINVAL:
226 return (NFS4ERR_INVAL);
227 case EMFILE:
228 return (NFS4ERR_RESOURCE);
229 case EFBIG:
230 return (NFS4ERR_FBIG);
231 case ENOSPC:
232 return (NFS4ERR_NOSPC);
233 case EROFS:
234 return (NFS4ERR_ROFS);
235 case EMLINK:
236 return (NFS4ERR_MLINK);
237 case EDEADLK:
238 return (NFS4ERR_DEADLOCK);
239 case ENOLCK:
240 return (NFS4ERR_DENIED);
241 case EREMOTE:
242 return (NFS4ERR_SERVERFAULT);
243 case ENOTSUP:
244 return (NFS4ERR_NOTSUPP);
245 case EDQUOT:
246 return (NFS4ERR_DQUOT);
247 case ENAMETOOLONG:
248 return (NFS4ERR_NAMETOOLONG);
249 case EOVERFLOW:
250 return (NFS4ERR_INVAL);
251 case ENOSYS:
252 return (NFS4ERR_NOTSUPP);
253 case ENOTEMPTY:
254 return (NFS4ERR_NOTEMPTY);
255 case EOPNOTSUPP:
256 return (NFS4ERR_NOTSUPP);
257 case ESTALE:
258 return (NFS4ERR_STALE);
259 case EAGAIN:
260 if (curthread->t_flag & T_WOULDBLOCK) {
261 curthread->t_flag &= ~T_WOULDBLOCK;
262 return (NFS4ERR_DELAY);
263 }
264 return (NFS4ERR_LOCKED);
265 default:
266 return ((enum nfsstat4)error);
267 }
268 }
269
270 int
geterrno4(enum nfsstat4 status)271 geterrno4(enum nfsstat4 status)
272 {
273 switch (status) {
274 case NFS4_OK:
275 return (0);
276 case NFS4ERR_PERM:
277 return (EPERM);
278 case NFS4ERR_NOENT:
279 return (ENOENT);
280 case NFS4ERR_IO:
281 return (EIO);
282 case NFS4ERR_NXIO:
283 return (ENXIO);
284 case NFS4ERR_ACCESS:
285 return (EACCES);
286 case NFS4ERR_EXIST:
287 return (EEXIST);
288 case NFS4ERR_XDEV:
289 return (EXDEV);
290 case NFS4ERR_NOTDIR:
291 return (ENOTDIR);
292 case NFS4ERR_ISDIR:
293 return (EISDIR);
294 case NFS4ERR_INVAL:
295 return (EINVAL);
296 case NFS4ERR_FBIG:
297 return (EFBIG);
298 case NFS4ERR_NOSPC:
299 return (ENOSPC);
300 case NFS4ERR_ROFS:
301 return (EROFS);
302 case NFS4ERR_MLINK:
303 return (EMLINK);
304 case NFS4ERR_NAMETOOLONG:
305 return (ENAMETOOLONG);
306 case NFS4ERR_NOTEMPTY:
307 return (ENOTEMPTY);
308 case NFS4ERR_DQUOT:
309 return (EDQUOT);
310 case NFS4ERR_STALE:
311 return (ESTALE);
312 case NFS4ERR_BADHANDLE:
313 return (ESTALE);
314 case NFS4ERR_BAD_COOKIE:
315 return (EINVAL);
316 case NFS4ERR_NOTSUPP:
317 return (EOPNOTSUPP);
318 case NFS4ERR_TOOSMALL:
319 return (EINVAL);
320 case NFS4ERR_SERVERFAULT:
321 return (EIO);
322 case NFS4ERR_BADTYPE:
323 return (EINVAL);
324 case NFS4ERR_DELAY:
325 return (ENXIO);
326 case NFS4ERR_SAME:
327 return (EPROTO);
328 case NFS4ERR_DENIED:
329 return (ENOLCK);
330 case NFS4ERR_EXPIRED:
331 return (EPROTO);
332 case NFS4ERR_LOCKED:
333 return (EACCES);
334 case NFS4ERR_GRACE:
335 return (EAGAIN);
336 case NFS4ERR_FHEXPIRED: /* if got here, failed to get a new fh */
337 return (ESTALE);
338 case NFS4ERR_SHARE_DENIED:
339 return (EACCES);
340 case NFS4ERR_WRONGSEC:
341 return (EPERM);
342 case NFS4ERR_CLID_INUSE:
343 return (EAGAIN);
344 case NFS4ERR_RESOURCE:
345 return (EAGAIN);
346 case NFS4ERR_MOVED:
347 return (EPROTO);
348 case NFS4ERR_NOFILEHANDLE:
349 return (EIO);
350 case NFS4ERR_MINOR_VERS_MISMATCH:
351 return (ENOTSUP);
352 case NFS4ERR_STALE_CLIENTID:
353 return (EIO);
354 case NFS4ERR_STALE_STATEID:
355 return (EIO);
356 case NFS4ERR_OLD_STATEID:
357 return (EIO);
358 case NFS4ERR_BAD_STATEID:
359 return (EIO);
360 case NFS4ERR_BAD_SEQID:
361 return (EIO);
362 case NFS4ERR_NOT_SAME:
363 return (EPROTO);
364 case NFS4ERR_LOCK_RANGE:
365 return (EPROTO);
366 case NFS4ERR_SYMLINK:
367 return (EPROTO);
368 case NFS4ERR_RESTOREFH:
369 return (EPROTO);
370 case NFS4ERR_LEASE_MOVED:
371 return (EPROTO);
372 case NFS4ERR_ATTRNOTSUPP:
373 return (ENOTSUP);
374 case NFS4ERR_NO_GRACE:
375 return (EPROTO);
376 case NFS4ERR_RECLAIM_BAD:
377 return (EPROTO);
378 case NFS4ERR_RECLAIM_CONFLICT:
379 return (EPROTO);
380 case NFS4ERR_BADXDR:
381 return (EINVAL);
382 case NFS4ERR_LOCKS_HELD:
383 return (EIO);
384 case NFS4ERR_OPENMODE:
385 return (EACCES);
386 case NFS4ERR_BADOWNER:
387 /*
388 * Client and server are in different DNS domains
389 * and the NFSMAPID_DOMAIN in /etc/default/nfs
390 * doesn't match. No good answer here. Return
391 * EACCESS, which translates to "permission denied".
392 */
393 return (EACCES);
394 case NFS4ERR_BADCHAR:
395 return (EINVAL);
396 case NFS4ERR_BADNAME:
397 return (EINVAL);
398 case NFS4ERR_BAD_RANGE:
399 return (EIO);
400 case NFS4ERR_LOCK_NOTSUPP:
401 return (ENOTSUP);
402 case NFS4ERR_OP_ILLEGAL:
403 return (EINVAL);
404 case NFS4ERR_DEADLOCK:
405 return (EDEADLK);
406 case NFS4ERR_FILE_OPEN:
407 return (EACCES);
408 case NFS4ERR_ADMIN_REVOKED:
409 return (EPROTO);
410 case NFS4ERR_CB_PATH_DOWN:
411 return (EPROTO);
412 default:
413 #ifdef DEBUG
414 zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
415 status);
416 #endif
417 return ((int)status);
418 }
419 }
420
421 void
nfs4_log_badowner(mntinfo4_t * mi,nfs_opnum4 op)422 nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
423 {
424 nfs4_server_t *server;
425
426 /*
427 * Return if already printed/queued a msg
428 * for this mount point.
429 */
430 if (mi->mi_flags & MI4_BADOWNER_DEBUG)
431 return;
432 /*
433 * Happens once per client <-> server pair.
434 */
435 if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
436 mi->mi_flags & MI4_INT))
437 return;
438
439 server = find_nfs4_server(mi);
440 if (server == NULL) {
441 nfs_rw_exit(&mi->mi_recovlock);
442 return;
443 }
444
445 if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
446 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
447 "!NFSMAPID_DOMAIN does not match"
448 " the server: %s domain.\n"
449 "Please check configuration",
450 mi->mi_curr_serv->sv_hostname);
451 server->s_flags |= N4S_BADOWNER_DEBUG;
452 }
453 mutex_exit(&server->s_lock);
454 nfs4_server_rele(server);
455 nfs_rw_exit(&mi->mi_recovlock);
456
457 /*
458 * Happens once per mntinfo4_t.
459 * This error is deemed as one of the recovery facts "RF_BADOWNER",
460 * queue this in the mesg queue for this mount_info. This message
461 * is not printed, meaning its absent from id_to_dump_solo_fact()
462 * but its there for inspection if the queue is ever dumped/inspected.
463 */
464 mutex_enter(&mi->mi_lock);
465 if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
466 nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
467 FALSE, NULL, 0, NULL);
468 mi->mi_flags |= MI4_BADOWNER_DEBUG;
469 }
470 mutex_exit(&mi->mi_lock);
471 }
472
473 int
nfs4_time_ntov(nfstime4 * ntime,timestruc_t * vatime)474 nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
475 {
476 int64_t sec;
477 int32_t nsec;
478
479 /*
480 * Here check that the nfsv4 time is valid for the system.
481 * nfsv4 time value is a signed 64-bit, and the system time
482 * may be either int64_t or int32_t (depends on the kernel),
483 * so if the kernel is 32-bit, the nfsv4 time value may not fit.
484 */
485 #ifndef _LP64
486 if (! NFS4_TIME_OK(ntime->seconds)) {
487 return (EOVERFLOW);
488 }
489 #endif
490
491 /* Invalid to specify 1 billion (or more) nsecs */
492 if (ntime->nseconds >= 1000000000)
493 return (EINVAL);
494
495 if (ntime->seconds < 0) {
496 sec = ntime->seconds + 1;
497 nsec = -1000000000 + ntime->nseconds;
498 } else {
499 sec = ntime->seconds;
500 nsec = ntime->nseconds;
501 }
502
503 vatime->tv_sec = sec;
504 vatime->tv_nsec = nsec;
505
506 return (0);
507 }
508
509 int
nfs4_time_vton(timestruc_t * vatime,nfstime4 * ntime)510 nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
511 {
512 int64_t sec;
513 uint32_t nsec;
514
515 /*
516 * nfsv4 time value is a signed 64-bit, and the system time
517 * may be either int64_t or int32_t (depends on the kernel),
518 * so all system time values will fit.
519 */
520 if (vatime->tv_nsec >= 0) {
521 sec = vatime->tv_sec;
522 nsec = vatime->tv_nsec;
523 } else {
524 sec = vatime->tv_sec - 1;
525 nsec = 1000000000 + vatime->tv_nsec;
526 }
527 ntime->seconds = sec;
528 ntime->nseconds = nsec;
529
530 return (0);
531 }
532
533 /*
534 * Converts a utf8 string to a valid null terminated filename string.
535 *
536 * XXX - Not actually translating the UTF-8 string as per RFC 2279.
537 * For now, just validate that the UTF-8 string off the wire
538 * does not have characters that will freak out UFS, and leave
539 * it at that.
540 */
541 char *
utf8_to_fn(utf8string * u8s,uint_t * lenp,char * s)542 utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
543 {
544 ASSERT(lenp != NULL);
545
546 if (u8s == NULL || u8s->utf8string_len <= 0 ||
547 u8s->utf8string_val == NULL)
548 return (NULL);
549
550 /*
551 * Check for obvious illegal filename chars
552 */
553 if (utf8_strchr(u8s, '/') != NULL) {
554 #ifdef DEBUG
555 if (nfs4_utf8_debug) {
556 char *path;
557 int len = u8s->utf8string_len;
558
559 path = kmem_alloc(len + 1, KM_SLEEP);
560 bcopy(u8s->utf8string_val, path, len);
561 path[len] = '\0';
562
563 zcmn_err(getzoneid(), CE_WARN,
564 "Invalid UTF-8 filename: %s", path);
565
566 kmem_free(path, len + 1);
567 }
568 #endif
569 return (NULL);
570 }
571
572 return (utf8_to_str(u8s, lenp, s));
573 }
574
575 /*
576 * Converts a utf8 string to a C string.
577 * kmem_allocs a new string if not supplied
578 */
579 char *
utf8_to_str(utf8string * str,uint_t * lenp,char * s)580 utf8_to_str(utf8string *str, uint_t *lenp, char *s)
581 {
582 char *sp;
583 char *u8p;
584 int len;
585 int i;
586
587 ASSERT(lenp != NULL);
588
589 if (str == NULL)
590 return (NULL);
591
592 u8p = str->utf8string_val;
593 len = str->utf8string_len;
594 if (len <= 0 || u8p == NULL) {
595 if (s)
596 *s = '\0';
597 return (NULL);
598 }
599
600 sp = s;
601 if (sp == NULL)
602 sp = kmem_alloc(len + 1, KM_SLEEP);
603
604 /*
605 * At least check for embedded nulls
606 */
607 for (i = 0; i < len; i++) {
608 sp[i] = u8p[i];
609 if (u8p[i] == '\0') {
610 #ifdef DEBUG
611 zcmn_err(getzoneid(), CE_WARN,
612 "Embedded NULL in UTF-8 string");
613 #endif
614 if (s == NULL)
615 kmem_free(sp, len + 1);
616 return (NULL);
617 }
618 }
619 sp[len] = '\0';
620 *lenp = len + 1;
621
622 return (sp);
623 }
624
625 /*
626 * str_to_utf8 - converts a null-terminated C string to a utf8 string
627 */
628 utf8string *
str_to_utf8(char * nm,utf8string * str)629 str_to_utf8(char *nm, utf8string *str)
630 {
631 int len;
632
633 if (str == NULL)
634 return (NULL);
635
636 if (nm == NULL || *nm == '\0') {
637 str->utf8string_len = 0;
638 str->utf8string_val = NULL;
639 }
640
641 len = strlen(nm);
642
643 str->utf8string_val = kmem_alloc(len, KM_SLEEP);
644 str->utf8string_len = len;
645 bcopy(nm, str->utf8string_val, len);
646
647 return (str);
648 }
649
650 utf8string *
utf8_copy(utf8string * src,utf8string * dest)651 utf8_copy(utf8string *src, utf8string *dest)
652 {
653 if (src == NULL)
654 return (NULL);
655 if (dest == NULL)
656 return (NULL);
657
658 if (src->utf8string_len > 0) {
659 dest->utf8string_val = kmem_alloc(src->utf8string_len,
660 KM_SLEEP);
661 bcopy(src->utf8string_val, dest->utf8string_val,
662 src->utf8string_len);
663 dest->utf8string_len = src->utf8string_len;
664 } else {
665 dest->utf8string_val = NULL;
666 dest->utf8string_len = 0;
667 }
668
669 return (dest);
670 }
671
672 int
utf8_compare(const utf8string * a,const utf8string * b)673 utf8_compare(const utf8string *a, const utf8string *b)
674 {
675 int mlen, cmp;
676 int alen, blen;
677 char *aval, *bval;
678
679 if ((a == NULL) && (b == NULL))
680 return (0);
681 else if (a == NULL)
682 return (-1);
683 else if (b == NULL)
684 return (1);
685
686 alen = a->utf8string_len;
687 blen = b->utf8string_len;
688 aval = a->utf8string_val;
689 bval = b->utf8string_val;
690
691 if (((alen == 0) || (aval == NULL)) &&
692 ((blen == 0) || (bval == NULL)))
693 return (0);
694 else if ((alen == 0) || (aval == NULL))
695 return (-1);
696 else if ((blen == 0) || (bval == NULL))
697 return (1);
698
699 mlen = MIN(alen, blen);
700 cmp = strncmp(aval, bval, mlen);
701
702 if ((cmp == 0) && (alen == blen))
703 return (0);
704 else if ((cmp == 0) && (alen < blen))
705 return (-1);
706 else if (cmp == 0)
707 return (1);
708 else if (cmp < 0)
709 return (-1);
710 return (1);
711 }
712
713 /*
714 * utf8_dir_verify - checks that the utf8 string is valid
715 */
716 int
utf8_dir_verify(utf8string * str)717 utf8_dir_verify(utf8string *str)
718 {
719 char *nm;
720 int len;
721
722 if (str == NULL)
723 return (0);
724
725 nm = str->utf8string_val;
726 len = str->utf8string_len;
727 if (nm == NULL || len == 0) {
728 return (0);
729 }
730
731 if (len == 1 && nm[0] == '.')
732 return (0);
733 if (len == 2 && nm[0] == '.' && nm[1] == '.')
734 return (0);
735
736 if (utf8_strchr(str, '/') != NULL)
737 return (0);
738
739 if (utf8_strchr(str, '\0') != NULL)
740 return (0);
741
742 return (1);
743 }
744
/*
 * from rpcsec module (common/rpcsec)
 *
 * As used in this file: authget() calls sec_clnt_geth() to obtain an auth
 * handle into ch_client->cl_auth and treats a zero return as success;
 * clfree4() releases such a handle with sec_clnt_freeh().
 * sec_clnt_freeinfo() is not called in the portion of the file reviewed
 * here.
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);
751
752 /*
753 * authget() gets an auth handle based on the security
754 * information from the servinfo in mountinfo.
755 * The auth handle is stored in ch_client->cl_auth.
756 *
757 * First security flavor of choice is to use sv_secdata
758 * which is initiated by the client. If that fails, get
759 * secinfo from the server and then select one from the
760 * server secinfo list .
761 *
762 * For RPCSEC_GSS flavor, upon success, a secure context is
763 * established between client and server.
764 */
765 int
authget(servinfo4_t * svp,CLIENT * ch_client,cred_t * cr)766 authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
767 {
768 int error, i;
769
770 /*
771 * SV4_TRYSECINFO indicates to try the secinfo list from
772 * sv_secinfo until a successful one is reached. Point
773 * sv_currsec to the selected security mechanism for
774 * later sessions.
775 */
776 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
777 if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
778 for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
779 i++) {
780 if (!(error = sec_clnt_geth(ch_client,
781 &svp->sv_secinfo->sdata[i],
782 cr, &ch_client->cl_auth))) {
783
784 svp->sv_currsec = &svp->sv_secinfo->sdata[i];
785 svp->sv_secinfo->index = i;
786 /* done */
787 svp->sv_flags &= ~SV4_TRYSECINFO;
788 break;
789 }
790
791 /*
792 * Allow the caller retry with the security flavor
793 * pointed by svp->sv_secinfo->index when
794 * ETIMEDOUT/ECONNRESET occurs.
795 */
796 if (error == ETIMEDOUT || error == ECONNRESET) {
797 svp->sv_secinfo->index = i;
798 break;
799 }
800 }
801 } else {
802 /* sv_currsec points to one of the entries in sv_secinfo */
803 if (svp->sv_currsec) {
804 error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
805 &ch_client->cl_auth);
806 } else {
807 /* If it's null, use sv_secdata. */
808 error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
809 &ch_client->cl_auth);
810 }
811 }
812 nfs_rw_exit(&svp->sv_lock);
813
814 return (error);
815 }
816
/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 *
 * Hands out an RPC client handle for the server svp, reusing a cached free
 * handle when one exists and creating a new one otherwise.
 *
 *   ci     program, version, read size, retransmit count and flags to use
 *   svp    server (knetconfig, address, security data)
 *   cr     credential for the transport and for authget()
 *   newcl  out: the client handle; set to NULL on failure
 *   chp    out: the chtab entry that owns *newcl (to be given back with
 *          clfree4()); set to NULL on failure
 *   nfscl  per-zone client data holding the handle cache and statistics
 *
 * Returns 0 on success; EINVAL if newcl, chp or ci is NULL; the error from
 * clnt_tli_kcreate() or authget(); or EINTR when authget() reports success
 * but left no auth handle in the client.
 *
 * The cache (nfscl_chtable4) is a list of chhead entries, one per
 * (program, version, transport device, protocol family) quadruple, each
 * carrying a list of free chtab entries.  It is protected by
 * nfscl_chtable4_lock.
 */
int
clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs4_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable4_lock);
	/* plistp trails ch: it addresses the link that points at ch. */
	plistp = &nfscl->nfscl_chtable4;
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			/*
			 * The KM_SLEEP allocations are done without the
			 * cache lock held, so the search has to be redone
			 * afterwards: the list may have changed meanwhile.
			 */
			mutex_exit(&nfscl->nfscl_chtable4_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		/* Consume the preallocated entry. */
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable4) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable4_lock);
		/*
		 * A preallocated chhead that turned out to be unnecessary
		 * (a matching entry was found on the second pass) is
		 * released here.
		 */
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		/* The result of re-initializing the handle is not checked. */
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);

		/*
		 * Get an auth handle.
		 */
		error = authget(svp, cp->ch_client, cr);
		if (error || cp->ch_client->cl_auth == NULL) {
			/*
			 * NOTE(review): unlike the failure paths for a newly
			 * created handle below, this path destroys a handle
			 * without decrementing the DEBUG clalloc counter --
			 * confirm whether that is intended.
			 */
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab4_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
#endif
	mutex_exit(&nfscl->nfscl_chtable4_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
	cp->ch_head = ch;

	/* Handle creation is bracketed by sigintr()/sigunintr(). */
	sigintr(&smask, (int)ci->cl_flags & MI4_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		/* Undo the clalloc increment made above. */
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	/*
	 * Discard the auth handle the new client came with; authget()
	 * installs the one to use.
	 */
	auth_destroy(cp->ch_client->cl_auth);

	/*
	 * Get an auth handle.
	 */
	error = authget(svp, cp->ch_client, cr);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}
987
988 static int
nfs_clget4(mntinfo4_t * mi,servinfo4_t * svp,cred_t * cr,CLIENT ** newcl,struct chtab ** chp,struct nfs4_clnt * nfscl)989 nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
990 struct chtab **chp, struct nfs4_clnt *nfscl)
991 {
992 clinfo_t ci;
993 bool_t is_recov;
994 int firstcall, error = 0;
995
996 /*
997 * Set read buffer size to rsize
998 * and add room for RPC headers.
999 */
1000 ci.cl_readsize = mi->mi_tsize;
1001 if (ci.cl_readsize != 0)
1002 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
1003
1004 /*
1005 * If soft mount and server is down just try once.
1006 * meaning: do not retransmit.
1007 */
1008 if (!(mi->mi_flags & MI4_HARD) && (mi->mi_flags & MI4_DOWN))
1009 ci.cl_retrans = 0;
1010 else
1011 ci.cl_retrans = mi->mi_retrans;
1012
1013 ci.cl_prog = mi->mi_prog;
1014 ci.cl_vers = mi->mi_vers;
1015 ci.cl_flags = mi->mi_flags;
1016
1017 /*
1018 * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
1019 * security flavor, the client tries to establish a security context
1020 * by contacting the server. If the connection is timed out or reset,
1021 * e.g. server reboot, we will try again.
1022 */
1023 is_recov = (curthread == mi->mi_recovthread);
1024 firstcall = 1;
1025
1026 do {
1027 error = clget4(&ci, svp, cr, newcl, chp, nfscl);
1028
1029 if (error == 0)
1030 break;
1031
1032 /*
1033 * For forced unmount and zone shutdown, bail out but
1034 * let the recovery thread do one more transmission.
1035 */
1036 if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
1037 (!is_recov || !firstcall)) {
1038 error = EIO;
1039 break;
1040 }
1041
1042 /* do not retry for soft mount */
1043 if (!(mi->mi_flags & MI4_HARD))
1044 break;
1045
1046 /* let the caller deal with the failover case */
1047 if (FAILOVER_MOUNT4(mi))
1048 break;
1049
1050 firstcall = 0;
1051
1052 } while (error == ETIMEDOUT || error == ECONNRESET);
1053
1054 return (error);
1055 }
1056
1057 void
clfree4(CLIENT * cl,struct chtab * cp,struct nfs4_clnt * nfscl)1058 clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
1059 {
1060 if (cl->cl_auth != NULL) {
1061 sec_clnt_freeh(cl->cl_auth);
1062 cl->cl_auth = NULL;
1063 }
1064
1065 /*
1066 * Timestamp this cache entry so that we know when it was last
1067 * used.
1068 */
1069 cp->ch_freed = gethrestime_sec();
1070
1071 /*
1072 * Add the free client handle to the front of the list.
1073 * This way, the list will be sorted in youngest to oldest
1074 * order.
1075 */
1076 mutex_enter(&nfscl->nfscl_chtable4_lock);
1077 cp->ch_list = cp->ch_head->ch_list;
1078 cp->ch_head->ch_list = cp;
1079 mutex_exit(&nfscl->nfscl_chtable4_lock);
1080 }
1081
#define	CL_HOLDTIME	60	/* time (in seconds) to hold client handles */

/*
 * clreclaim4_zone - destroy cached client handles that have been idle too
 * long.
 *
 * nfscl is the per-zone client data whose handle cache is scanned, and
 * cl_holdtime the minimum age in seconds (measured from the ch_freed stamp
 * set by clfree4()) an idle handle must have reached to be reclaimed.
 *
 * Works in two phases: with nfscl_chtable4_lock held, the eligible entries
 * are unlinked from the per-quadruple free lists and gathered on a private
 * list; after the lock is dropped, each gathered handle is destroyed and
 * its chtab entry returned to chtab4_cache.
 */
static void
clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *ch;
	struct chtab *cp; /* list of objects that can be reclaimed */
	struct chtab *cpe;
	struct chtab *cpl;
	struct chtab **cpp;
#ifdef DEBUG
	int n = 0;	/* number of handles destroyed */
	clstat4_debug.clreclaim.value.ui64++;
#endif

	/*
	 * Need to reclaim some memory, so step through the cache
	 * looking through the lists for entries which can be freed.
	 */
	cp = NULL;

	mutex_enter(&nfscl->nfscl_chtable4_lock);

	/*
	 * Here we step through each non-NULL quadruple and start to
	 * construct the reclaim list pointed to by cp.  Note that
	 * cp will contain all eligible chtab entries.  When this traversal
	 * completes, chtab entries from the last quadruple will be at the
	 * front of cp and entries from previously inspected quadruples have
	 * been appended to the rear of cp.
	 */
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_list == NULL)
			continue;
		/*
		 * Search each list for entries older than
		 * cl_holdtime seconds.  The lists are maintained
		 * in youngest to oldest order so that when the
		 * first entry is found which is old enough, then
		 * all of the rest of the entries on the list will
		 * be old enough as well.
		 */
		cpl = ch->ch_list;
		/* cpp trails cpl: it addresses the link that points at cpl. */
		cpp = &ch->ch_list;
		while (cpl != NULL &&
		    cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
			cpp = &cpl->ch_list;
			cpl = cpl->ch_list;
		}
		if (cpl != NULL) {
			/* Cut the old tail (starting at cpl) off this list. */
			*cpp = NULL;
			if (cp != NULL) {
				/*
				 * Walk to the end of the tail just removed
				 * and hang the entries collected so far
				 * behind it.
				 */
				cpe = cpl;
				while (cpe->ch_list != NULL)
					cpe = cpe->ch_list;
				cpe->ch_list = cp;
			}
			cp = cpl;
		}
	}

	mutex_exit(&nfscl->nfscl_chtable4_lock);

	/*
	 * If cp is empty, then there is nothing to reclaim here.
	 */
	if (cp == NULL)
		return;

	/*
	 * Step through the list of entries to free, destroying each client
	 * handle and kmem_free'ing the memory for each entry.
	 */
	while (cp != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(cp->ch_client);
		/* Fetch the successor before the entry is freed. */
		cpl = cp->ch_list;
		kmem_cache_free(chtab4_cache, cp);
		cp = cpl;
	}

#ifdef DEBUG
	/*
	 * Update clalloc so that nfsstat shows the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}
1173
1174 /* ARGSUSED */
1175 static void
clreclaim4(void * all)1176 clreclaim4(void *all)
1177 {
1178 struct nfs4_clnt *nfscl;
1179
1180 /*
1181 * The system is low on memory; go through and try to reclaim some from
1182 * every zone on the system.
1183 */
1184 mutex_enter(&nfs4_clnt_list_lock);
1185 nfscl = list_head(&nfs4_clnt_list);
1186 for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl))
1187 clreclaim4_zone(nfscl, CL_HOLDTIME);
1188 mutex_exit(&nfs4_clnt_list_lock);
1189 }
1190
/*
 * Minimum time-out values indexed by call type
 * These units are in "eighths" of a second to avoid multiplies
 *
 * NOTE(review): the table is not referenced in the portion of the file
 * reviewed here; which call type each of the three slots stands for has to
 * be confirmed against its users.
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/* One tenth of NFS_COTS_TIMEO (which is defined outside this file). */
#define	SHORTWAIT	(NFS_COTS_TIMEO / 10)

/*
 * Back off for retransmission timeout.  MAXTIMO is expressed in clock
 * ticks (20 * hz).
 *
 * backoff(tim) leaves a value that has already reached MAXTIMO unchanged
 * and otherwise doubles it, capped at MAXTIMO.  Both macros evaluate their
 * argument more than once, so it must be free of side effects.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
1207
1208 static int
nfs4_rfscall(mntinfo4_t * mi,rpcproc_t which,xdrproc_t xdrargs,caddr_t argsp,xdrproc_t xdrres,caddr_t resp,cred_t * icr,int * doqueue,enum clnt_stat * rpc_statusp,int flags,struct nfs4_clnt * nfscl)1209 nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1210 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
1211 enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
1212 {
1213 CLIENT *client;
1214 struct chtab *ch;
1215 cred_t *cr = icr;
1216 struct rpc_err rpcerr, rpcerr_tmp;
1217 enum clnt_stat status;
1218 int error;
1219 struct timeval wait;
1220 int timeo; /* in units of hz */
1221 bool_t tryagain, is_recov;
1222 bool_t cred_cloned = FALSE;
1223 k_sigset_t smask;
1224 servinfo4_t *svp;
1225 #ifdef DEBUG
1226 char *bufp;
1227 #endif
1228 int firstcall;
1229
1230 rpcerr.re_status = RPC_SUCCESS;
1231
1232 /*
1233 * If we know that we are rebooting then let's
1234 * not bother with doing any over the wireness.
1235 */
1236 mutex_enter(&mi->mi_lock);
1237 if (mi->mi_flags & MI4_SHUTDOWN) {
1238 mutex_exit(&mi->mi_lock);
1239 return (EIO);
1240 }
1241 mutex_exit(&mi->mi_lock);
1242
1243 /* For TSOL, use a new cred which has net_mac_aware flag */
1244 if (!cred_cloned && is_system_labeled()) {
1245 cred_cloned = TRUE;
1246 cr = crdup(icr);
1247 (void) setpflags(NET_MAC_AWARE, 1, cr);
1248 }
1249
1250 /*
1251 * clget() calls clnt_tli_kinit() which clears the xid, so we
1252 * are guaranteed to reprocess the retry as a new request.
1253 */
1254 svp = mi->mi_curr_serv;
1255 rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
1256 if (rpcerr.re_errno != 0)
1257 return (rpcerr.re_errno);
1258
1259 timeo = (mi->mi_timeo * hz) / 10;
1260
1261 /*
1262 * If hard mounted fs, retry call forever unless hard error
1263 * occurs.
1264 *
1265 * For forced unmount, let the recovery thread through but return
1266 * an error for all others. This is so that user processes can
1267 * exit quickly. The recovery thread bails out after one
1268 * transmission so that it can tell if it needs to continue.
1269 *
1270 * For zone shutdown, behave as above to encourage quick
1271 * process exit, but also fail quickly when servers have
1272 * timed out before and reduce the timeouts.
1273 */
1274 is_recov = (curthread == mi->mi_recovthread);
1275 firstcall = 1;
1276 do {
1277 tryagain = FALSE;
1278
1279 NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
1280 "nfs4_rfscall: vfs_flag=0x%x, %s",
1281 mi->mi_vfsp->vfs_flag,
1282 is_recov ? "recov thread" : "not recov thread"));
1283
1284 /*
1285 * It's possible while we're retrying the admin
1286 * decided to reboot.
1287 */
1288 mutex_enter(&mi->mi_lock);
1289 if (mi->mi_flags & MI4_SHUTDOWN) {
1290 mutex_exit(&mi->mi_lock);
1291 clfree4(client, ch, nfscl);
1292 if (cred_cloned)
1293 crfree(cr);
1294 return (EIO);
1295 }
1296 mutex_exit(&mi->mi_lock);
1297
1298 if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1299 (!is_recov || !firstcall)) {
1300 clfree4(client, ch, nfscl);
1301 if (cred_cloned)
1302 crfree(cr);
1303 return (EIO);
1304 }
1305
1306 if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
1307 mutex_enter(&mi->mi_lock);
1308 if ((mi->mi_flags & MI4_TIMEDOUT) ||
1309 !is_recov || !firstcall) {
1310 mutex_exit(&mi->mi_lock);
1311 clfree4(client, ch, nfscl);
1312 if (cred_cloned)
1313 crfree(cr);
1314 return (EIO);
1315 }
1316 mutex_exit(&mi->mi_lock);
1317 timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
1318 }
1319
1320 firstcall = 0;
1321 TICK_TO_TIMEVAL(timeo, &wait);
1322
1323 /*
1324 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1325 * and SIGTERM. (Preserving the existing masks).
1326 * Mask out SIGINT if mount option nointr is specified.
1327 */
1328 sigintr(&smask, (int)mi->mi_flags & MI4_INT);
1329 if (!(mi->mi_flags & MI4_INT))
1330 client->cl_nosignal = TRUE;
1331
1332 /*
1333 * If there is a current signal, then don't bother
1334 * even trying to send out the request because we
1335 * won't be able to block waiting for the response.
1336 * Simply assume RPC_INTR and get on with it.
1337 */
1338 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1339 status = RPC_INTR;
1340 else {
1341 status = CLNT_CALL(client, which, xdrargs, argsp,
1342 xdrres, resp, wait);
1343 }
1344
1345 if (!(mi->mi_flags & MI4_INT))
1346 client->cl_nosignal = FALSE;
1347 /*
1348 * restore original signal mask
1349 */
1350 sigunintr(&smask);
1351
1352 switch (status) {
1353 case RPC_SUCCESS:
1354 break;
1355
1356 case RPC_INTR:
1357 /*
1358 * There is no way to recover from this error,
1359 * even if mount option nointr is specified.
1360 * SIGKILL, for example, cannot be blocked.
1361 */
1362 rpcerr.re_status = RPC_INTR;
1363 rpcerr.re_errno = EINTR;
1364 break;
1365
1366 case RPC_UDERROR:
1367 /*
1368 * If the NFS server is local (vold) and
1369 * it goes away then we get RPC_UDERROR.
1370 * This is a retryable error, so we would
1371 * loop, so check to see if the specific
1372 * error was ECONNRESET, indicating that
1373 * target did not exist at all. If so,
1374 * return with RPC_PROGUNAVAIL and
1375 * ECONNRESET to indicate why.
1376 */
1377 CLNT_GETERR(client, &rpcerr);
1378 if (rpcerr.re_errno == ECONNRESET) {
1379 rpcerr.re_status = RPC_PROGUNAVAIL;
1380 rpcerr.re_errno = ECONNRESET;
1381 break;
1382 }
1383 /*FALLTHROUGH*/
1384
1385 default: /* probably RPC_TIMEDOUT */
1386
1387 if (IS_UNRECOVERABLE_RPC(status))
1388 break;
1389
1390 /*
1391 * increment server not responding count
1392 */
1393 mutex_enter(&mi->mi_lock);
1394 mi->mi_noresponse++;
1395 mutex_exit(&mi->mi_lock);
1396 #ifdef DEBUG
1397 nfscl->nfscl_stat.noresponse.value.ui64++;
1398 #endif
1399 /*
1400 * On zone shutdown, mark server dead and move on.
1401 */
1402 if (zone_status_get(curproc->p_zone) >=
1403 ZONE_IS_SHUTTING_DOWN) {
1404 mutex_enter(&mi->mi_lock);
1405 mi->mi_flags |= MI4_TIMEDOUT;
1406 mutex_exit(&mi->mi_lock);
1407 clfree4(client, ch, nfscl);
1408 if (cred_cloned)
1409 crfree(cr);
1410 return (EIO);
1411 }
1412
1413 /*
1414 * NFS client failover support:
1415 * return and let the caller take care of
1416 * failover. We only return for failover mounts
1417 * because otherwise we want the "not responding"
1418 * message, the timer updates, etc.
1419 */
1420 if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
1421 (error = try_failover(status)) != 0) {
1422 clfree4(client, ch, nfscl);
1423 if (cred_cloned)
1424 crfree(cr);
1425 *rpc_statusp = status;
1426 return (error);
1427 }
1428
1429 if (flags & RFSCALL_SOFT)
1430 break;
1431
1432 tryagain = TRUE;
1433
1434 /*
1435 * The call is in progress (over COTS).
1436 * Try the CLNT_CALL again, but don't
1437 * print a noisy error message.
1438 */
1439 if (status == RPC_INPROGRESS)
1440 break;
1441
1442 timeo = backoff(timeo);
1443 CLNT_GETERR(client, &rpcerr_tmp);
1444
1445 mutex_enter(&mi->mi_lock);
1446 if (!(mi->mi_flags & MI4_PRINTED)) {
1447 mi->mi_flags |= MI4_PRINTED;
1448 mutex_exit(&mi->mi_lock);
1449 if ((status == RPC_CANTSEND) &&
1450 (rpcerr_tmp.re_errno == ENOBUFS))
1451 nfs4_queue_fact(RF_SENDQ_FULL, mi, 0,
1452 0, 0, FALSE, NULL, 0, NULL);
1453 else
1454 nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
1455 0, 0, 0, FALSE, NULL, 0, NULL);
1456 } else
1457 mutex_exit(&mi->mi_lock);
1458
1459 if (*doqueue && nfs_has_ctty()) {
1460 *doqueue = 0;
1461 if (!(mi->mi_flags & MI4_NOPRINT)) {
1462 if ((status == RPC_CANTSEND) &&
1463 (rpcerr_tmp.re_errno == ENOBUFS))
1464 nfs4_queue_fact(RF_SENDQ_FULL,
1465 mi, 0, 0, 0, FALSE, NULL,
1466 0, NULL);
1467 else
1468 nfs4_queue_fact(
1469 RF_SRV_NOT_RESPOND, mi, 0,
1470 0, 0, FALSE, NULL, 0, NULL);
1471 }
1472 }
1473 }
1474 } while (tryagain);
1475
1476 DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
1477 int, rpcerr.re_errno);
1478
1479 if (status != RPC_SUCCESS) {
1480 zoneid_t zoneid = mi->mi_zone->zone_id;
1481
1482 /*
1483 * Let soft mounts use the timed out message.
1484 */
1485 if (status == RPC_INPROGRESS)
1486 status = RPC_TIMEDOUT;
1487 nfscl->nfscl_stat.badcalls.value.ui64++;
1488 if (status != RPC_INTR) {
1489 mutex_enter(&mi->mi_lock);
1490 mi->mi_flags |= MI4_DOWN;
1491 mutex_exit(&mi->mi_lock);
1492 CLNT_GETERR(client, &rpcerr);
1493 #ifdef DEBUG
1494 bufp = clnt_sperror(client, svp->sv_hostname);
1495 zprintf(zoneid, "NFS%d %s failed for %s\n",
1496 mi->mi_vers, mi->mi_rfsnames[which], bufp);
1497 if (nfs_has_ctty()) {
1498 if (!(mi->mi_flags & MI4_NOPRINT)) {
1499 uprintf("NFS%d %s failed for %s\n",
1500 mi->mi_vers, mi->mi_rfsnames[which],
1501 bufp);
1502 }
1503 }
1504 kmem_free(bufp, MAXPATHLEN);
1505 #else
1506 zprintf(zoneid,
1507 "NFS %s failed for server %s: error %d (%s)\n",
1508 mi->mi_rfsnames[which], svp->sv_hostname,
1509 status, clnt_sperrno(status));
1510 if (nfs_has_ctty()) {
1511 if (!(mi->mi_flags & MI4_NOPRINT)) {
1512 uprintf(
1513 "NFS %s failed for server %s: error %d (%s)\n",
1514 mi->mi_rfsnames[which],
1515 svp->sv_hostname, status,
1516 clnt_sperrno(status));
1517 }
1518 }
1519 #endif
1520 /*
1521 * when CLNT_CALL() fails with RPC_AUTHERROR,
1522 * re_errno is set appropriately depending on
1523 * the authentication error
1524 */
1525 if (status == RPC_VERSMISMATCH ||
1526 status == RPC_PROGVERSMISMATCH)
1527 rpcerr.re_errno = EIO;
1528 }
1529 } else {
1530 /*
1531 * Test the value of mi_down and mi_printed without
1532 * holding the mi_lock mutex. If they are both zero,
1533 * then it is okay to skip the down and printed
1534 * processing. This saves on a mutex_enter and
1535 * mutex_exit pair for a normal, successful RPC.
1536 * This was just complete overhead.
1537 */
1538 if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
1539 mutex_enter(&mi->mi_lock);
1540 mi->mi_flags &= ~MI4_DOWN;
1541 if (mi->mi_flags & MI4_PRINTED) {
1542 mi->mi_flags &= ~MI4_PRINTED;
1543 mutex_exit(&mi->mi_lock);
1544 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1545 nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
1546 0, FALSE, NULL, 0, NULL);
1547 } else
1548 mutex_exit(&mi->mi_lock);
1549 }
1550
1551 if (*doqueue == 0) {
1552 if (!(mi->mi_flags & MI4_NOPRINT) &&
1553 !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1554 nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
1555 FALSE, NULL, 0, NULL);
1556
1557 *doqueue = 1;
1558 }
1559 }
1560
1561 clfree4(client, ch, nfscl);
1562 if (cred_cloned)
1563 crfree(cr);
1564
1565 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1566
1567 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
1568 rpcerr.re_errno);
1569
1570 *rpc_statusp = status;
1571 return (rpcerr.re_errno);
1572 }
1573
1574 /*
1575 * rfs4call - general wrapper for RPC calls initiated by the client
1576 */
1577 void
rfs4call(mntinfo4_t * mi,COMPOUND4args_clnt * argsp,COMPOUND4res_clnt * resp,cred_t * cr,int * doqueue,int flags,nfs4_error_t * ep)1578 rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
1579 cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
1580 {
1581 int i, error;
1582 enum clnt_stat rpc_status = NFS4_OK;
1583 int num_resops;
1584 struct nfs4_clnt *nfscl;
1585
1586 ASSERT(nfs_zone() == mi->mi_zone);
1587 nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1588 ASSERT(nfscl != NULL);
1589
1590 nfscl->nfscl_stat.calls.value.ui64++;
1591 mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;
1592
1593 /* Set up the results struct for XDR usage */
1594 resp->argsp = argsp;
1595 resp->array = NULL;
1596 resp->status = 0;
1597 resp->decode_len = 0;
1598
1599 error = nfs4_rfscall(mi, NFSPROC4_COMPOUND,
1600 xdr_COMPOUND4args_clnt, (caddr_t)argsp,
1601 xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
1602 doqueue, &rpc_status, flags, nfscl);
1603
1604 /* Return now if it was an RPC error */
1605 if (error) {
1606 ep->error = error;
1607 ep->stat = resp->status;
1608 ep->rpc_status = rpc_status;
1609 return;
1610 }
1611
1612 /* else we'll count the processed operations */
1613 num_resops = resp->decode_len;
1614 for (i = 0; i < num_resops; i++) {
1615 /*
1616 * Count the individual operations
1617 * processed by the server.
1618 */
1619 if (resp->array[i].resop >= NFSPROC4_NULL &&
1620 resp->array[i].resop <= OP_WRITE)
1621 mi->mi_reqs[resp->array[i].resop].value.ui64++;
1622 }
1623
1624 ep->error = 0;
1625 ep->stat = resp->status;
1626 ep->rpc_status = rpc_status;
1627 }
1628
1629 /*
1630 * nfs4rename_update - updates stored state after a rename. Currently this
1631 * is the path of the object and anything under it, and the filehandle of
1632 * the renamed object.
1633 */
1634 void
nfs4rename_update(vnode_t * renvp,vnode_t * ndvp,nfs_fh4 * nfh4p,char * nnm)1635 nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
1636 {
1637 sfh4_update(VTOR4(renvp)->r_fh, nfh4p);
1638 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
1639 }
1640
1641 /*
1642 * Routine to look up the filehandle for the given path and rootvp.
1643 *
1644 * Return values:
1645 * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
1646 * updated.
1647 * - error: return value (errno value) and/or *statp is set appropriately.
1648 */
1649 #define RML_ORDINARY 1
1650 #define RML_NAMED_ATTR 2
1651 #define RML_ATTRDIR 3
1652
1653 static void
remap_lookup(nfs4_fname_t * fname,vnode_t * rootvp,int filetype,cred_t * cr,nfs_fh4 * fhp,nfs4_ga_res_t * garp,nfs_fh4 * pfhp,nfs4_ga_res_t * pgarp,nfs4_error_t * ep)1654 remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
1655 int filetype, cred_t *cr,
1656 nfs_fh4 *fhp, nfs4_ga_res_t *garp, /* fh, attrs for object */
1657 nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp, /* fh, attrs for parent */
1658 nfs4_error_t *ep)
1659 {
1660 COMPOUND4args_clnt args;
1661 COMPOUND4res_clnt res;
1662 nfs_argop4 *argop;
1663 nfs_resop4 *resop;
1664 int num_argops;
1665 lookup4_param_t lookuparg;
1666 nfs_fh4 *tmpfhp;
1667 int doqueue = 1;
1668 char *path;
1669 mntinfo4_t *mi;
1670
1671 ASSERT(fname != NULL);
1672 ASSERT(rootvp->v_type == VDIR);
1673
1674 mi = VTOMI4(rootvp);
1675 path = fn_path(fname);
1676 switch (filetype) {
1677 case RML_NAMED_ATTR:
1678 lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
1679 args.ctag = TAG_REMAP_LOOKUP_NA;
1680 break;
1681 case RML_ATTRDIR:
1682 lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
1683 args.ctag = TAG_REMAP_LOOKUP_AD;
1684 break;
1685 case RML_ORDINARY:
1686 lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1687 args.ctag = TAG_REMAP_LOOKUP;
1688 break;
1689 default:
1690 ep->error = EINVAL;
1691 return;
1692 }
1693 lookuparg.argsp = &args;
1694 lookuparg.resp = &res;
1695 lookuparg.header_len = 1; /* Putfh */
1696 lookuparg.trailer_len = 0;
1697 lookuparg.ga_bits = NFS4_VATTR_MASK;
1698 lookuparg.mi = VTOMI4(rootvp);
1699
1700 (void) nfs4lookup_setup(path, &lookuparg, 1);
1701
1702 /* 0: putfh directory */
1703 argop = args.array;
1704 argop[0].argop = OP_CPUTFH;
1705 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;
1706
1707 num_argops = args.array_len;
1708
1709 rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1710
1711 if (ep->error || res.status != NFS4_OK)
1712 goto exit;
1713
1714 /* get the object filehandle */
1715 resop = &res.array[res.array_len - 2];
1716 if (resop->resop != OP_GETFH) {
1717 nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1718 0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1719 ep->stat = NFS4ERR_SERVERFAULT;
1720 goto exit;
1721 }
1722 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1723 if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1724 nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1725 tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1726 TAG_NONE, 0, 0);
1727 ep->stat = NFS4ERR_SERVERFAULT;
1728 goto exit;
1729 }
1730 fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1731 nfs_fh4_copy(tmpfhp, fhp);
1732
1733 /* get the object attributes */
1734 resop = &res.array[res.array_len - 1];
1735 if (garp && resop->resop == OP_GETATTR)
1736 *garp = resop->nfs_resop4_u.opgetattr.ga_res;
1737
1738 /* See if there are enough fields in the response for parent info */
1739 if ((int)res.array_len - 5 <= 0)
1740 goto exit;
1741
1742 /* get the parent filehandle */
1743 resop = &res.array[res.array_len - 5];
1744 if (resop->resop != OP_GETFH) {
1745 nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
1746 0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
1747 ep->stat = NFS4ERR_SERVERFAULT;
1748 goto exit;
1749 }
1750 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1751 if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
1752 nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
1753 tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
1754 TAG_NONE, 0, 0);
1755 ep->stat = NFS4ERR_SERVERFAULT;
1756 goto exit;
1757 }
1758 pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
1759 nfs_fh4_copy(tmpfhp, pfhp);
1760
1761 /* get the parent attributes */
1762 resop = &res.array[res.array_len - 4];
1763 if (pgarp && resop->resop == OP_GETATTR)
1764 *pgarp = resop->nfs_resop4_u.opgetattr.ga_res;
1765
1766 exit:
1767 /*
1768 * It is too hard to remember where all the OP_LOOKUPs are
1769 */
1770 nfs4args_lookup_free(argop, num_argops);
1771 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1772
1773 if (!ep->error)
1774 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1775 kmem_free(path, strlen(path)+1);
1776 }
1777
1778 /*
1779 * NFS client failover / volatile filehandle support
1780 *
1781 * Recover the filehandle for the given rnode.
1782 *
1783 * Errors are returned via the nfs4_error_t parameter.
1784 */
1785
1786 void
nfs4_remap_file(mntinfo4_t * mi,vnode_t * vp,int flags,nfs4_error_t * ep)1787 nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
1788 {
1789 int is_stub;
1790 rnode4_t *rp = VTOR4(vp);
1791 vnode_t *rootvp = NULL;
1792 vnode_t *dvp = NULL;
1793 cred_t *cr, *cred_otw;
1794 nfs4_ga_res_t gar, pgar;
1795 nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
1796 int filetype = RML_ORDINARY;
1797 nfs4_recov_state_t recov = {NULL, 0, 0};
1798 int badfhcount = 0;
1799 nfs4_open_stream_t *osp = NULL;
1800 bool_t first_time = TRUE; /* first time getting OTW cred */
1801 bool_t last_time = FALSE; /* last time getting OTW cred */
1802
1803 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1804 "nfs4_remap_file: remapping %s", rnode4info(rp)));
1805 ASSERT(nfs4_consistent_type(vp));
1806
1807 if (vp->v_flag & VROOT) {
1808 nfs4_remap_root(mi, ep, flags);
1809 return;
1810 }
1811
1812 /*
1813 * Given the root fh, use the path stored in
1814 * the rnode to find the fh for the new server.
1815 */
1816 ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
1817 if (ep->error != 0)
1818 return;
1819
1820 cr = curthread->t_cred;
1821 ASSERT(cr != NULL);
1822 get_remap_cred:
1823 /*
1824 * Releases the osp, if it is provided.
1825 * Puts a hold on the cred_otw and the new osp (if found).
1826 */
1827 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
1828 &first_time, &last_time);
1829 ASSERT(cred_otw != NULL);
1830
1831 if (rp->r_flags & R4ISXATTR) {
1832 filetype = RML_NAMED_ATTR;
1833 (void) vtodv(vp, &dvp, cred_otw, FALSE);
1834 }
1835
1836 if (vp->v_flag & V_XATTRDIR) {
1837 filetype = RML_ATTRDIR;
1838 }
1839
1840 if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
1841 /* file mount, doesn't need a remap */
1842 goto done;
1843 }
1844
1845 again:
1846 remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
1847 &newfh, &gar, &newpfh, &pgar, ep);
1848
1849 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1850 "nfs4_remap_file: remap_lookup returned %d/%d",
1851 ep->error, ep->stat));
1852
1853 if (last_time == FALSE && ep->error == EACCES) {
1854 crfree(cred_otw);
1855 if (dvp != NULL)
1856 VN_RELE(dvp);
1857 goto get_remap_cred;
1858 }
1859 if (ep->error != 0)
1860 goto done;
1861
1862 switch (ep->stat) {
1863 case NFS4_OK:
1864 badfhcount = 0;
1865 if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1866 mutex_enter(&rp->r_statelock);
1867 rp->r_delay_interval = 0;
1868 mutex_exit(&rp->r_statelock);
1869 uprintf("NFS File Available..\n");
1870 }
1871 break;
1872 case NFS4ERR_FHEXPIRED:
1873 case NFS4ERR_BADHANDLE:
1874 case NFS4ERR_STALE:
1875 /*
1876 * If we ran into filehandle problems, we should try to
1877 * remap the root vnode first and hope life gets better.
1878 * But we need to avoid loops.
1879 */
1880 if (badfhcount++ > 0)
1881 goto done;
1882 if (newfh.nfs_fh4_len != 0) {
1883 kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
1884 newfh.nfs_fh4_len = 0;
1885 }
1886 if (newpfh.nfs_fh4_len != 0) {
1887 kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
1888 newpfh.nfs_fh4_len = 0;
1889 }
1890 /* relative path - remap rootvp then retry */
1891 VN_RELE(rootvp);
1892 rootvp = NULL;
1893 nfs4_remap_root(mi, ep, flags);
1894 if (ep->error != 0 || ep->stat != NFS4_OK)
1895 goto done;
1896 ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
1897 if (ep->error != 0)
1898 goto done;
1899 goto again;
1900 case NFS4ERR_DELAY:
1901 badfhcount = 0;
1902 nfs4_set_delay_wait(vp);
1903 ep->error = nfs4_wait_for_delay(vp, &recov);
1904 if (ep->error != 0)
1905 goto done;
1906 goto again;
1907 case NFS4ERR_ACCESS:
1908 /* get new cred, try again */
1909 if (last_time == TRUE)
1910 goto done;
1911 if (dvp != NULL)
1912 VN_RELE(dvp);
1913 crfree(cred_otw);
1914 goto get_remap_cred;
1915 default:
1916 goto done;
1917 }
1918
1919 /*
1920 * Check on the new and old rnodes before updating;
1921 * if the vnode type or size changes, issue a warning
1922 * and mark the file dead.
1923 */
1924 mutex_enter(&rp->r_statelock);
1925 if (flags & NFS4_REMAP_CKATTRS) {
1926 if (vp->v_type != gar.n4g_va.va_type ||
1927 (vp->v_type != VDIR &&
1928 rp->r_size != gar.n4g_va.va_size)) {
1929 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1930 "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
1931 (int)rp->r_size, (int)gar.n4g_va.va_size,
1932 vp->v_type, gar.n4g_va.va_type));
1933 mutex_exit(&rp->r_statelock);
1934 nfs4_queue_event(RE_FILE_DIFF, mi,
1935 rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
1936 TAG_NONE, TAG_NONE, 0, 0);
1937 nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
1938 goto done;
1939 }
1940 }
1941 ASSERT(gar.n4g_va.va_type != VNON);
1942 rp->r_server = mi->mi_curr_serv;
1943
1944 /*
1945 * Turn this object into a "stub" object if we
1946 * crossed an underlying server fs boundary.
1947 *
1948 * This stub will be for a mirror-mount.
1949 * A referral would look like a boundary crossing
1950 * as well, but would not be the same type of object,
1951 * so we would expect to mark the object dead.
1952 *
1953 * See comment in r4_do_attrcache() for more details.
1954 */
1955 is_stub = 0;
1956 if (gar.n4g_fsid_valid) {
1957 (void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
1958 rp->r_srv_fsid = gar.n4g_fsid;
1959 if (!FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
1960 is_stub = 1;
1961 nfs_rw_exit(&rp->r_server->sv_lock);
1962 #ifdef DEBUG
1963 } else {
1964 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1965 "remap_file: fsid attr not provided by server. rp=%p",
1966 (void *)rp));
1967 #endif
1968 }
1969 if (is_stub)
1970 r4_stub_mirrormount(rp);
1971 else
1972 r4_stub_none(rp);
1973 mutex_exit(&rp->r_statelock);
1974 nfs4_attrcache_noinval(vp, &gar, gethrtime()); /* force update */
1975 sfh4_update(rp->r_fh, &newfh);
1976 ASSERT(nfs4_consistent_type(vp));
1977
1978 /*
1979 * If we got parent info, use it to update the parent
1980 */
1981 if (newpfh.nfs_fh4_len != 0) {
1982 if (rp->r_svnode.sv_dfh != NULL)
1983 sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
1984 if (dvp != NULL) {
1985 /* force update of attrs */
1986 nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
1987 }
1988 }
1989 done:
1990 if (newfh.nfs_fh4_len != 0)
1991 kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
1992 if (newpfh.nfs_fh4_len != 0)
1993 kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
1994 if (cred_otw != NULL)
1995 crfree(cred_otw);
1996 if (rootvp != NULL)
1997 VN_RELE(rootvp);
1998 if (dvp != NULL)
1999 VN_RELE(dvp);
2000 if (osp != NULL)
2001 open_stream_rele(osp, rp);
2002 }
2003
2004 /*
2005 * Client-side failover support: remap the filehandle for vp if it appears
2006 * necessary. errors are returned via the nfs4_error_t parameter; though,
2007 * if there is a problem, we will just try again later.
2008 */
2009
2010 void
nfs4_check_remap(mntinfo4_t * mi,vnode_t * vp,int flags,nfs4_error_t * ep)2011 nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
2012 {
2013 if (vp == NULL)
2014 return;
2015
2016 if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY))
2017 return;
2018
2019 if (VTOR4(vp)->r_server == mi->mi_curr_serv)
2020 return;
2021
2022 nfs4_remap_file(mi, vp, flags, ep);
2023 }
2024
2025 /*
2026 * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
2027 *
2028 * Our caller has a filehandle for ".." relative to a particular
2029 * directory object. We want to find or create a parent vnode
2030 * with that filehandle and return it. We can of course create
2031 * a vnode from this filehandle, but we need to also make sure
2032 * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
2033 * that we have a parent FH for future reopens as well. If
2034 * we have a remap failure, we won't be able to reopen this
2035 * file, but we won't treat that as fatal because a reopen
2036 * is at least unlikely. Someday nfs4_reopen() should look
2037 * for a missing parent FH and try a remap to recover from it.
2038 *
2039 * need_start_op argument indicates whether this function should
2040 * do a start_op before calling remap_lookup(). This should
2041 * be FALSE, if you are the recovery thread or in an op; otherwise,
2042 * set it to TRUE.
2043 */
2044 int
nfs4_make_dotdot(nfs4_sharedfh_t * fhp,hrtime_t t,vnode_t * dvp,cred_t * cr,vnode_t ** vpp,int need_start_op)2045 nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
2046 cred_t *cr, vnode_t **vpp, int need_start_op)
2047 {
2048 mntinfo4_t *mi = VTOMI4(dvp);
2049 nfs4_fname_t *np = NULL, *pnp = NULL;
2050 vnode_t *vp = NULL, *rootvp = NULL;
2051 rnode4_t *rp;
2052 nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
2053 nfs4_ga_res_t gar, pgar;
2054 vattr_t va, pva;
2055 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2056 nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
2057 nfs4_recov_state_t recov_state;
2058
2059 #ifdef DEBUG
2060 /*
2061 * ensure need_start_op is correct
2062 */
2063 {
2064 int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
2065 (curthread == mi->mi_recovthread));
2066 /* C needs a ^^ operator! */
2067 ASSERT(((need_start_op) && (!no_need_start_op)) ||
2068 ((! need_start_op) && (no_need_start_op)));
2069 }
2070 #endif
2071 ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());
2072
2073 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
2074 "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
2075 rnode4info(VTOR4(dvp))));
2076
2077 /*
2078 * rootvp might be needed eventually. Holding it now will
2079 * ensure that r4find_unlocked() will find it, if ".." is the root.
2080 */
2081 e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
2082 if (e.error != 0)
2083 goto out;
2084 rp = r4find_unlocked(fhp, mi->mi_vfsp);
2085 if (rp != NULL) {
2086 *vpp = RTOV4(rp);
2087 VN_RELE(rootvp);
2088 return (0);
2089 }
2090
2091 /*
2092 * Since we don't have the rnode, we have to go over the wire.
2093 * remap_lookup() can get all of the filehandles and attributes
2094 * we need in one operation.
2095 */
2096 np = fn_parent(VTOSV(dvp)->sv_name);
2097 /* if a parent was not found return an error */
2098 if (np == NULL) {
2099 e.error = ENOENT;
2100 goto out;
2101 }
2102
2103 recov_state.rs_flags = 0;
2104 recov_state.rs_num_retry_despite_err = 0;
2105 recov_retry:
2106 if (need_start_op) {
2107 e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
2108 &recov_state, NULL);
2109 if (e.error != 0) {
2110 goto out;
2111 }
2112 }
2113 va.va_type = VNON;
2114 pva.va_type = VNON;
2115 remap_lookup(np, rootvp, RML_ORDINARY, cr,
2116 &newfh, &gar, &newpfh, &pgar, &e);
2117 if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2118 if (need_start_op) {
2119 bool_t abort;
2120
2121 abort = nfs4_start_recovery(&e, mi,
2122 rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL, NULL,
2123 NULL);
2124 if (abort) {
2125 nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2126 &recov_state, FALSE);
2127 if (e.error == 0)
2128 e.error = EIO;
2129 goto out;
2130 }
2131 nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2132 &recov_state, TRUE);
2133 goto recov_retry;
2134 }
2135 if (e.error == 0)
2136 e.error = EIO;
2137 goto out;
2138 }
2139
2140 if (!e.error) {
2141 va = gar.n4g_va;
2142 pva = pgar.n4g_va;
2143 }
2144
2145 if ((e.error != 0) ||
2146 (va.va_type != VDIR)) {
2147 if (need_start_op)
2148 nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2149 &recov_state, FALSE);
2150 if (e.error == 0)
2151 e.error = EIO;
2152 goto out;
2153 }
2154
2155 if (e.stat != NFS4_OK) {
2156 if (need_start_op)
2157 nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2158 &recov_state, FALSE);
2159 e.error = EIO;
2160 goto out;
2161 }
2162
2163 /*
2164 * It is possible for remap_lookup() to return with no error,
2165 * but without providing the parent filehandle and attrs.
2166 */
2167 if (pva.va_type != VDIR) {
2168 /*
2169 * Call remap_lookup() again, this time with the
2170 * newpfh and pgar args in the first position.
2171 */
2172 pnp = fn_parent(np);
2173 if (pnp != NULL) {
2174 remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
2175 &newpfh, &pgar, NULL, NULL, &e);
2176 if (nfs4_needs_recovery(&e, FALSE,
2177 mi->mi_vfsp)) {
2178 if (need_start_op) {
2179 bool_t abort;
2180
2181 abort = nfs4_start_recovery(&e, mi,
2182 rootvp, NULL, NULL, NULL,
2183 OP_LOOKUP, NULL, NULL, NULL);
2184 if (abort) {
2185 nfs4_end_fop(mi, rootvp, NULL,
2186 OH_LOOKUP, &recov_state,
2187 FALSE);
2188 if (e.error == 0)
2189 e.error = EIO;
2190 goto out;
2191 }
2192 nfs4_end_fop(mi, rootvp, NULL,
2193 OH_LOOKUP, &recov_state, TRUE);
2194 goto recov_retry;
2195 }
2196 if (e.error == 0)
2197 e.error = EIO;
2198 goto out;
2199 }
2200
2201 if (e.stat != NFS4_OK) {
2202 if (need_start_op)
2203 nfs4_end_fop(mi, rootvp, NULL,
2204 OH_LOOKUP, &recov_state, FALSE);
2205 e.error = EIO;
2206 goto out;
2207 }
2208 }
2209 if ((pnp == NULL) ||
2210 (e.error != 0) ||
2211 (pva.va_type == VNON)) {
2212 if (need_start_op)
2213 nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2214 &recov_state, FALSE);
2215 if (e.error == 0)
2216 e.error = EIO;
2217 goto out;
2218 }
2219 }
2220 ASSERT(newpfh.nfs_fh4_len != 0);
2221 if (need_start_op)
2222 nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
2223 psfh = sfh4_get(&newpfh, mi);
2224
2225 sfh = sfh4_get(&newfh, mi);
2226 vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);
2227
2228 out:
2229 if (np != NULL)
2230 fn_rele(&np);
2231 if (pnp != NULL)
2232 fn_rele(&pnp);
2233 if (newfh.nfs_fh4_len != 0)
2234 kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
2235 if (newpfh.nfs_fh4_len != 0)
2236 kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
2237 if (sfh != NULL)
2238 sfh4_rele(&sfh);
2239 if (psfh != NULL)
2240 sfh4_rele(&psfh);
2241 if (rootvp != NULL)
2242 VN_RELE(rootvp);
2243 *vpp = vp;
2244 return (e.error);
2245 }
2246
#ifdef DEBUG
/*
 * DEBUG-only global counter, initialized to zero.
 * NOTE(review): judging by the name it tracks memory used for rnode path
 * strings; its users are outside this part of the file -- confirm.
 */
size_t r_path_memuse = 0;
#endif
2250
2251 /*
2252 * NFS client failover support
2253 *
2254 * sv4_free() frees the malloc'd portion of a "servinfo_t".
2255 */
2256 void
sv4_free(servinfo4_t * svp)2257 sv4_free(servinfo4_t *svp)
2258 {
2259 servinfo4_t *next;
2260 struct knetconfig *knconf;
2261
2262 while (svp != NULL) {
2263 next = svp->sv_next;
2264 if (svp->sv_dhsec)
2265 sec_clnt_freeinfo(svp->sv_dhsec);
2266 if (svp->sv_secdata)
2267 sec_clnt_freeinfo(svp->sv_secdata);
2268 if (svp->sv_save_secinfo &&
2269 svp->sv_save_secinfo != svp->sv_secinfo)
2270 secinfo_free(svp->sv_save_secinfo);
2271 if (svp->sv_secinfo)
2272 secinfo_free(svp->sv_secinfo);
2273 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
2274 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
2275 knconf = svp->sv_knconf;
2276 if (knconf != NULL) {
2277 if (knconf->knc_protofmly != NULL)
2278 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2279 if (knconf->knc_proto != NULL)
2280 kmem_free(knconf->knc_proto, KNC_STRSIZE);
2281 kmem_free(knconf, sizeof (*knconf));
2282 }
2283 knconf = svp->sv_origknconf;
2284 if (knconf != NULL) {
2285 if (knconf->knc_protofmly != NULL)
2286 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2287 if (knconf->knc_proto != NULL)
2288 kmem_free(knconf->knc_proto, KNC_STRSIZE);
2289 kmem_free(knconf, sizeof (*knconf));
2290 }
2291 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
2292 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
2293 if (svp->sv_path != NULL) {
2294 kmem_free(svp->sv_path, svp->sv_pathlen);
2295 }
2296 nfs_rw_destroy(&svp->sv_lock);
2297 kmem_free(svp, sizeof (*svp));
2298 svp = next;
2299 }
2300 }
2301
2302 void
nfs4_printfhandle(nfs4_fhandle_t * fhp)2303 nfs4_printfhandle(nfs4_fhandle_t *fhp)
2304 {
2305 int *ip;
2306 char *buf;
2307 size_t bufsize;
2308 char *cp;
2309
2310 /*
2311 * 13 == "(file handle:"
2312 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
2313 * 1 == ' '
2314 * 8 == maximum strlen of "%x"
2315 * 3 == ")\n\0"
2316 */
2317 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2318 buf = kmem_alloc(bufsize, KM_NOSLEEP);
2319 if (buf == NULL)
2320 return;
2321
2322 cp = buf;
2323 (void) strcpy(cp, "(file handle:");
2324 while (*cp != '\0')
2325 cp++;
2326 for (ip = (int *)fhp->fh_buf;
2327 ip < (int *)&fhp->fh_buf[fhp->fh_len];
2328 ip++) {
2329 (void) sprintf(cp, " %x", *ip);
2330 while (*cp != '\0')
2331 cp++;
2332 }
2333 (void) strcpy(cp, ")\n");
2334
2335 zcmn_err(getzoneid(), CE_CONT, "%s", buf);
2336
2337 kmem_free(buf, bufsize);
2338 }
2339
2340 /*
2341 * The NFSv4 readdir cache subsystem.
2342 *
2343 * We provide a set of interfaces to allow the rest of the system to utilize
2344 * a caching mechanism while encapsulating the details of the actual
2345 * implementation. This should allow for better maintainability and
2346 * extensibility by consolidating the implementation details in one location.
2347 */
2348
2349 /*
2350 * Comparator used by AVL routines.
2351 */
2352 static int
rddir4_cache_compar(const void * x,const void * y)2353 rddir4_cache_compar(const void *x, const void *y)
2354 {
2355 rddir4_cache_impl *ai = (rddir4_cache_impl *)x;
2356 rddir4_cache_impl *bi = (rddir4_cache_impl *)y;
2357 rddir4_cache *a = &ai->rc;
2358 rddir4_cache *b = &bi->rc;
2359
2360 if (a->nfs4_cookie == b->nfs4_cookie) {
2361 if (a->buflen == b->buflen)
2362 return (0);
2363 if (a->buflen < b->buflen)
2364 return (-1);
2365 return (1);
2366 }
2367
2368 if (a->nfs4_cookie < b->nfs4_cookie)
2369 return (-1);
2370
2371 return (1);
2372 }
2373
2374 /*
2375 * Allocate an opaque handle for the readdir cache.
2376 */
2377 void
rddir4_cache_create(rnode4_t * rp)2378 rddir4_cache_create(rnode4_t *rp)
2379 {
2380 ASSERT(rp->r_dir == NULL);
2381
2382 rp->r_dir = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
2383
2384 avl_create(rp->r_dir, rddir4_cache_compar, sizeof (rddir4_cache_impl),
2385 offsetof(rddir4_cache_impl, tree));
2386 }
2387
2388 /*
2389 * Purge the cache of all cached readdir responses.
2390 */
2391 void
rddir4_cache_purge(rnode4_t * rp)2392 rddir4_cache_purge(rnode4_t *rp)
2393 {
2394 rddir4_cache_impl *rdip;
2395 rddir4_cache_impl *nrdip;
2396
2397 ASSERT(MUTEX_HELD(&rp->r_statelock));
2398
2399 if (rp->r_dir == NULL)
2400 return;
2401
2402 rdip = avl_first(rp->r_dir);
2403
2404 while (rdip != NULL) {
2405 nrdip = AVL_NEXT(rp->r_dir, rdip);
2406 avl_remove(rp->r_dir, rdip);
2407 rdip->rc.flags &= ~RDDIRCACHED;
2408 rddir4_cache_rele(rp, &rdip->rc);
2409 rdip = nrdip;
2410 }
2411 ASSERT(avl_numnodes(rp->r_dir) == 0);
2412 }
2413
2414 /*
2415 * Destroy the readdir cache.
2416 */
2417 void
rddir4_cache_destroy(rnode4_t * rp)2418 rddir4_cache_destroy(rnode4_t *rp)
2419 {
2420 ASSERT(MUTEX_HELD(&rp->r_statelock));
2421 if (rp->r_dir == NULL)
2422 return;
2423
2424 rddir4_cache_purge(rp);
2425 avl_destroy(rp->r_dir);
2426 kmem_free(rp->r_dir, sizeof (avl_tree_t));
2427 rp->r_dir = NULL;
2428 }
2429
2430 /*
2431 * Locate a readdir response from the readdir cache.
2432 *
2433 * Return values:
2434 *
2435 * NULL - If there is an unrecoverable situation like the operation may have
2436 * been interrupted.
2437 *
2438 * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
2439 * The flags are set approprately, such that the caller knows
2440 * what state the entry is in.
2441 */
2442 rddir4_cache *
rddir4_cache_lookup(rnode4_t * rp,offset_t cookie,int count)2443 rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
2444 {
2445 rddir4_cache_impl *rdip = NULL;
2446 rddir4_cache_impl srdip;
2447 rddir4_cache *srdc;
2448 rddir4_cache *rdc = NULL;
2449 rddir4_cache *nrdc = NULL;
2450 avl_index_t where;
2451
2452 top:
2453 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2454 ASSERT(MUTEX_HELD(&rp->r_statelock));
2455 /*
2456 * Check to see if the readdir cache has been disabled. If so, then
2457 * simply allocate an rddir4_cache entry and return it, since caching
2458 * operations do not apply.
2459 */
2460 if (rp->r_dir == NULL) {
2461 if (nrdc == NULL) {
2462 /*
2463 * Drop the lock because we are doing a sleeping
2464 * allocation.
2465 */
2466 mutex_exit(&rp->r_statelock);
2467 rdc = rddir4_cache_alloc(KM_SLEEP);
2468 rdc->nfs4_cookie = cookie;
2469 rdc->buflen = count;
2470 mutex_enter(&rp->r_statelock);
2471 return (rdc);
2472 }
2473 return (nrdc);
2474 }
2475
2476 srdc = &srdip.rc;
2477 srdc->nfs4_cookie = cookie;
2478 srdc->buflen = count;
2479
2480 rdip = avl_find(rp->r_dir, &srdip, &where);
2481
2482 /*
2483 * If we didn't find an entry then create one and insert it
2484 * into the cache.
2485 */
2486 if (rdip == NULL) {
2487 /*
2488 * Check for the case where we have made a second pass through
2489 * the cache due to a lockless allocation. If we find that no
2490 * thread has already inserted this entry, do the insert now
2491 * and return.
2492 */
2493 if (nrdc != NULL) {
2494 avl_insert(rp->r_dir, nrdc->data, where);
2495 nrdc->flags |= RDDIRCACHED;
2496 rddir4_cache_hold(nrdc);
2497 return (nrdc);
2498 }
2499
2500 #ifdef DEBUG
2501 nfs4_readdir_cache_misses++;
2502 #endif
2503 /*
2504 * First, try to allocate an entry without sleeping. If that
2505 * fails then drop the lock and do a sleeping allocation.
2506 */
2507 nrdc = rddir4_cache_alloc(KM_NOSLEEP);
2508 if (nrdc != NULL) {
2509 nrdc->nfs4_cookie = cookie;
2510 nrdc->buflen = count;
2511 avl_insert(rp->r_dir, nrdc->data, where);
2512 nrdc->flags |= RDDIRCACHED;
2513 rddir4_cache_hold(nrdc);
2514 return (nrdc);
2515 }
2516
2517 /*
2518 * Drop the lock and do a sleeping allocation. We incur
2519 * additional overhead by having to search the cache again,
2520 * but this case should be rare.
2521 */
2522 mutex_exit(&rp->r_statelock);
2523 nrdc = rddir4_cache_alloc(KM_SLEEP);
2524 nrdc->nfs4_cookie = cookie;
2525 nrdc->buflen = count;
2526 mutex_enter(&rp->r_statelock);
2527 /*
2528 * We need to take another pass through the cache
2529 * since we dropped our lock to perform the alloc.
2530 * Another thread may have come by and inserted the
2531 * entry we are interested in.
2532 */
2533 goto top;
2534 }
2535
2536 /*
2537 * Check to see if we need to free our entry. This can happen if
2538 * another thread came along beat us to the insert. We can
2539 * safely call rddir4_cache_free directly because no other thread
2540 * would have a reference to this entry.
2541 */
2542 if (nrdc != NULL)
2543 rddir4_cache_free((rddir4_cache_impl *)nrdc->data);
2544
2545 #ifdef DEBUG
2546 nfs4_readdir_cache_hits++;
2547 #endif
2548 /*
2549 * Found something. Make sure it's ready to return.
2550 */
2551 rdc = &rdip->rc;
2552 rddir4_cache_hold(rdc);
2553 /*
2554 * If the cache entry is in the process of being filled in, wait
2555 * until this completes. The RDDIRWAIT bit is set to indicate that
2556 * someone is waiting and when the thread currently filling the entry
2557 * is done, it should do a cv_broadcast to wakeup all of the threads
2558 * waiting for it to finish. If the thread wakes up to find that
2559 * someone new is now trying to complete the the entry, go back
2560 * to sleep.
2561 */
2562 while (rdc->flags & RDDIR) {
2563 /*
2564 * The entry is not complete.
2565 */
2566 nfs_rw_exit(&rp->r_rwlock);
2567 rdc->flags |= RDDIRWAIT;
2568 #ifdef DEBUG
2569 nfs4_readdir_cache_waits++;
2570 #endif
2571 while (rdc->flags & RDDIRWAIT) {
2572 if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
2573 /*
2574 * We got interrupted, probably the user
2575 * typed ^C or an alarm fired. We free the
2576 * new entry if we allocated one.
2577 */
2578 rddir4_cache_rele(rp, rdc);
2579 mutex_exit(&rp->r_statelock);
2580 (void) nfs_rw_enter_sig(&rp->r_rwlock,
2581 RW_READER, FALSE);
2582 mutex_enter(&rp->r_statelock);
2583 return (NULL);
2584 }
2585 }
2586 mutex_exit(&rp->r_statelock);
2587 (void) nfs_rw_enter_sig(&rp->r_rwlock,
2588 RW_READER, FALSE);
2589 mutex_enter(&rp->r_statelock);
2590 }
2591
2592 /*
2593 * The entry we were waiting on may have been purged from
2594 * the cache and should no longer be used, release it and
2595 * start over.
2596 */
2597 if (!(rdc->flags & RDDIRCACHED)) {
2598 rddir4_cache_rele(rp, rdc);
2599 goto top;
2600 }
2601
2602 /*
2603 * The entry is completed. Return it.
2604 */
2605 return (rdc);
2606 }
2607
2608 /*
2609 * Allocate a cache element and return it. Can return NULL if memory is
2610 * low.
2611 */
2612 static rddir4_cache *
rddir4_cache_alloc(int flags)2613 rddir4_cache_alloc(int flags)
2614 {
2615 rddir4_cache_impl *rdip = NULL;
2616 rddir4_cache *rc = NULL;
2617
2618 rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);
2619
2620 if (rdip != NULL) {
2621 rc = &rdip->rc;
2622 rc->data = (void *)rdip;
2623 rc->nfs4_cookie = 0;
2624 rc->nfs4_ncookie = 0;
2625 rc->entries = NULL;
2626 rc->eof = 0;
2627 rc->entlen = 0;
2628 rc->buflen = 0;
2629 rc->actlen = 0;
2630 /*
2631 * A readdir is required so set the flag.
2632 */
2633 rc->flags = RDDIRREQ;
2634 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
2635 rc->error = 0;
2636 mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
2637 rdip->count = 1;
2638 #ifdef DEBUG
2639 atomic_add_64(&clstat4_debug.dirent.value.ui64, 1);
2640 #endif
2641 }
2642 return (rc);
2643 }
2644
2645 /*
2646 * Increment the reference count to this cache element.
2647 */
2648 static void
rddir4_cache_hold(rddir4_cache * rc)2649 rddir4_cache_hold(rddir4_cache *rc)
2650 {
2651 rddir4_cache_impl *rdip = (rddir4_cache_impl *)rc->data;
2652
2653 mutex_enter(&rdip->lock);
2654 rdip->count++;
2655 mutex_exit(&rdip->lock);
2656 }
2657
2658 /*
2659 * Release a reference to this cache element. If the count is zero then
2660 * free the element.
2661 */
2662 void
rddir4_cache_rele(rnode4_t * rp,rddir4_cache * rdc)2663 rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
2664 {
2665 rddir4_cache_impl *rdip = (rddir4_cache_impl *)rdc->data;
2666
2667 ASSERT(MUTEX_HELD(&rp->r_statelock));
2668
2669 /*
2670 * Check to see if we have any waiters. If so, we can wake them
2671 * so that they can proceed.
2672 */
2673 if (rdc->flags & RDDIRWAIT) {
2674 rdc->flags &= ~RDDIRWAIT;
2675 cv_broadcast(&rdc->cv);
2676 }
2677
2678 mutex_enter(&rdip->lock);
2679 ASSERT(rdip->count > 0);
2680 if (--rdip->count == 0) {
2681 mutex_exit(&rdip->lock);
2682 rddir4_cache_free(rdip);
2683 } else
2684 mutex_exit(&rdip->lock);
2685 }
2686
2687 /*
2688 * Free a cache element.
2689 */
2690 static void
rddir4_cache_free(rddir4_cache_impl * rdip)2691 rddir4_cache_free(rddir4_cache_impl *rdip)
2692 {
2693 rddir4_cache *rc = &rdip->rc;
2694
2695 #ifdef DEBUG
2696 atomic_add_64(&clstat4_debug.dirent.value.ui64, -1);
2697 #endif
2698 if (rc->entries != NULL)
2699 kmem_free(rc->entries, rc->buflen);
2700 cv_destroy(&rc->cv);
2701 mutex_destroy(&rdip->lock);
2702 kmem_free(rdip, sizeof (*rdip));
2703 }
2704
2705 /*
2706 * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
2707 * framework.
2708 */
2709 static int
cl4_snapshot(kstat_t * ksp,void * buf,int rw)2710 cl4_snapshot(kstat_t *ksp, void *buf, int rw)
2711 {
2712 ksp->ks_snaptime = gethrtime();
2713 if (rw == KSTAT_WRITE) {
2714 bcopy(buf, ksp->ks_private, sizeof (clstat4_tmpl));
2715 #ifdef DEBUG
2716 /*
2717 * Currently only the global zone can write to kstats, but we
2718 * add the check just for paranoia.
2719 */
2720 if (INGLOBALZONE(curproc))
2721 bcopy((char *)buf + sizeof (clstat4_tmpl),
2722 &clstat4_debug, sizeof (clstat4_debug));
2723 #endif
2724 } else {
2725 bcopy(ksp->ks_private, buf, sizeof (clstat4_tmpl));
2726 #ifdef DEBUG
2727 /*
2728 * If we're displaying the "global" debug kstat values, we
2729 * display them as-is to all zones since in fact they apply to
2730 * the system as a whole.
2731 */
2732 bcopy(&clstat4_debug, (char *)buf + sizeof (clstat4_tmpl),
2733 sizeof (clstat4_debug));
2734 #endif
2735 }
2736 return (0);
2737 }
2738
2739
2740
2741 /*
2742 * Zone support
2743 */
2744 static void *
clinit4_zone(zoneid_t zoneid)2745 clinit4_zone(zoneid_t zoneid)
2746 {
2747 kstat_t *nfs4_client_kstat;
2748 struct nfs4_clnt *nfscl;
2749 uint_t ndata;
2750
2751 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
2752 mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
2753 nfscl->nfscl_chtable4 = NULL;
2754 nfscl->nfscl_zoneid = zoneid;
2755
2756 bcopy(&clstat4_tmpl, &nfscl->nfscl_stat, sizeof (clstat4_tmpl));
2757 ndata = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
2758 #ifdef DEBUG
2759 ndata += sizeof (clstat4_debug) / sizeof (kstat_named_t);
2760 #endif
2761 if ((nfs4_client_kstat = kstat_create_zone("nfs", 0, "nfs4_client",
2762 "misc", KSTAT_TYPE_NAMED, ndata,
2763 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
2764 nfs4_client_kstat->ks_private = &nfscl->nfscl_stat;
2765 nfs4_client_kstat->ks_snapshot = cl4_snapshot;
2766 kstat_install(nfs4_client_kstat);
2767 }
2768 mutex_enter(&nfs4_clnt_list_lock);
2769 list_insert_head(&nfs4_clnt_list, nfscl);
2770 mutex_exit(&nfs4_clnt_list_lock);
2771
2772 return (nfscl);
2773 }
2774
2775 /*ARGSUSED*/
2776 static void
clfini4_zone(zoneid_t zoneid,void * arg)2777 clfini4_zone(zoneid_t zoneid, void *arg)
2778 {
2779 struct nfs4_clnt *nfscl = arg;
2780 chhead_t *chp, *next;
2781
2782 if (nfscl == NULL)
2783 return;
2784 mutex_enter(&nfs4_clnt_list_lock);
2785 list_remove(&nfs4_clnt_list, nfscl);
2786 mutex_exit(&nfs4_clnt_list_lock);
2787 clreclaim4_zone(nfscl, 0);
2788 for (chp = nfscl->nfscl_chtable4; chp != NULL; chp = next) {
2789 ASSERT(chp->ch_list == NULL);
2790 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
2791 next = chp->ch_next;
2792 kmem_free(chp, sizeof (*chp));
2793 }
2794 kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
2795 mutex_destroy(&nfscl->nfscl_chtable4_lock);
2796 kmem_free(nfscl, sizeof (*nfscl));
2797 }
2798
2799 /*
2800 * Called by endpnt_destructor to make sure the client handles are
2801 * cleaned up before the RPC endpoints. This becomes a no-op if
2802 * clfini_zone (above) is called first. This function is needed
2803 * (rather than relying on clfini_zone to clean up) because the ZSD
2804 * callbacks have no ordering mechanism, so we have no way to ensure
2805 * that clfini_zone is called before endpnt_destructor.
2806 */
2807 void
clcleanup4_zone(zoneid_t zoneid)2808 clcleanup4_zone(zoneid_t zoneid)
2809 {
2810 struct nfs4_clnt *nfscl;
2811
2812 mutex_enter(&nfs4_clnt_list_lock);
2813 nfscl = list_head(&nfs4_clnt_list);
2814 for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl)) {
2815 if (nfscl->nfscl_zoneid == zoneid) {
2816 clreclaim4_zone(nfscl, 0);
2817 break;
2818 }
2819 }
2820 mutex_exit(&nfs4_clnt_list_lock);
2821 }
2822
2823 int
nfs4_subr_init(void)2824 nfs4_subr_init(void)
2825 {
2826 /*
2827 * Allocate and initialize the client handle cache
2828 */
2829 chtab4_cache = kmem_cache_create("client_handle4_cache",
2830 sizeof (struct chtab), 0, NULL, NULL, clreclaim4, NULL,
2831 NULL, 0);
2832
2833 /*
2834 * Initialize the list of per-zone client handles (and associated data).
2835 * This needs to be done before we call zone_key_create().
2836 */
2837 list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
2838 offsetof(struct nfs4_clnt, nfscl_node));
2839
2840 /*
2841 * Initialize the zone_key for per-zone client handle lists.
2842 */
2843 zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);
2844
2845 if (nfs4err_delay_time == 0)
2846 nfs4err_delay_time = NFS4ERR_DELAY_TIME;
2847
2848 return (0);
2849 }
2850
2851 int
nfs4_subr_fini(void)2852 nfs4_subr_fini(void)
2853 {
2854 /*
2855 * Deallocate the client handle cache
2856 */
2857 kmem_cache_destroy(chtab4_cache);
2858
2859 /*
2860 * Destroy the zone_key
2861 */
2862 (void) zone_key_delete(nfs4clnt_zone_key);
2863
2864 return (0);
2865 }
2866 /*
2867 * Set or Clear direct I/O flag
2868 * VOP_RWLOCK() is held for write access to prevent a race condition
2869 * which would occur if a process is in the middle of a write when
2870 * directio flag gets set. It is possible that all pages may not get flushed.
2871 *
2872 * This is a copy of nfs_directio, changes here may need to be made
2873 * there and vice versa.
2874 */
2875
2876 int
nfs4_directio(vnode_t * vp,int cmd,cred_t * cr)2877 nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
2878 {
2879 int error = 0;
2880 rnode4_t *rp;
2881
2882 rp = VTOR4(vp);
2883
2884 if (cmd == DIRECTIO_ON) {
2885
2886 if (rp->r_flags & R4DIRECTIO)
2887 return (0);
2888
2889 /*
2890 * Flush the page cache.
2891 */
2892
2893 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
2894
2895 if (rp->r_flags & R4DIRECTIO) {
2896 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2897 return (0);
2898 }
2899
2900 if (nfs4_has_pages(vp) &&
2901 ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
2902 error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
2903 B_INVAL, cr, NULL);
2904 if (error) {
2905 if (error == ENOSPC || error == EDQUOT) {
2906 mutex_enter(&rp->r_statelock);
2907 if (!rp->r_error)
2908 rp->r_error = error;
2909 mutex_exit(&rp->r_statelock);
2910 }
2911 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2912 return (error);
2913 }
2914 }
2915
2916 mutex_enter(&rp->r_statelock);
2917 rp->r_flags |= R4DIRECTIO;
2918 mutex_exit(&rp->r_statelock);
2919 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
2920 return (0);
2921 }
2922
2923 if (cmd == DIRECTIO_OFF) {
2924 mutex_enter(&rp->r_statelock);
2925 rp->r_flags &= ~R4DIRECTIO; /* disable direct mode */
2926 mutex_exit(&rp->r_statelock);
2927 return (0);
2928 }
2929
2930 return (EINVAL);
2931 }
2932
2933 /*
2934 * Return TRUE if the file has any pages. Always go back to
2935 * the master vnode to check v_pages since none of the shadows
2936 * can have pages.
2937 */
2938
2939 bool_t
nfs4_has_pages(vnode_t * vp)2940 nfs4_has_pages(vnode_t *vp)
2941 {
2942 rnode4_t *rp;
2943
2944 rp = VTOR4(vp);
2945 if (IS_SHADOW(vp, rp))
2946 vp = RTOV4(rp); /* RTOV4 always gives the master */
2947
2948 return (vn_has_cached_data(vp));
2949 }
2950
/*
 * Failover decision table, indexed directly by the clnt_stat value that
 * CLNT_CALL returned.  A non-zero error in the selected row is both the
 * error to return AND the signal that failover should be attempted; a
 * zero error means do not fail over.
 *
 * Special note: direct indexing is only valid while the RPC_ values keep
 * their current numbering.  Each row therefore also records the RPC_
 * value it was written for, which lets the lookup code notice a mismatch,
 * issue a warning, and attempt failover anyway as a defensive measure.
 */

static struct try_failover_tab {
	enum clnt_stat	cstat;
	int		error;
} try_failover_table [] = {
	{ RPC_SUCCESS,			0 },
	{ RPC_CANTENCODEARGS,		0 },
	{ RPC_CANTDECODERES,		0 },
	{ RPC_CANTSEND,			ECOMM },
	{ RPC_CANTRECV,			ECOMM },
	{ RPC_TIMEDOUT,			ETIMEDOUT },
	{ RPC_VERSMISMATCH,		0 },
	{ RPC_AUTHERROR,		0 },
	{ RPC_PROGUNAVAIL,		0 },
	{ RPC_PROGVERSMISMATCH,		0 },
	{ RPC_PROCUNAVAIL,		0 },
	{ RPC_CANTDECODEARGS,		0 },
	{ RPC_SYSTEMERROR,		ENOSR },
	{ RPC_UNKNOWNHOST,		EHOSTUNREACH },
	{ RPC_RPCBFAILURE,		ENETUNREACH },
	{ RPC_PROGNOTREGISTERED,	ECONNREFUSED },
	{ RPC_FAILED,			ETIMEDOUT },
	{ RPC_UNKNOWNPROTO,		EHOSTUNREACH },
	{ RPC_INTR,			0 },
	{ RPC_UNKNOWNADDR,		EHOSTUNREACH },
	{ RPC_TLIERROR,			0 },
	{ RPC_NOBROADCAST,		EHOSTUNREACH },
	{ RPC_N2AXLATEFAILURE,		ECONNREFUSED },
	{ RPC_UDERROR,			0 },
	{ RPC_INPROGRESS,		0 },
	{ RPC_STALERACHANDLE,		EINVAL },
	{ RPC_CANTCONNECT,		ECONNREFUSED },
	{ RPC_XPRTFAILED,		ECONNABORTED },
	{ RPC_CANTCREATESTREAM,		ECONNREFUSED },
	{ RPC_CANTSTORE,		ENOBUFS }
};
3002
3003 /*
3004 * nfs4_try_failover - determine whether the client should
3005 * attempt failover based on the values stored in the nfs4_error_t.
3006 */
3007 int
nfs4_try_failover(nfs4_error_t * ep)3008 nfs4_try_failover(nfs4_error_t *ep)
3009 {
3010 if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
3011 return (TRUE);
3012
3013 if (ep->error && ep->rpc_status != RPC_SUCCESS)
3014 return (try_failover(ep->rpc_status) != 0 ? TRUE : FALSE);
3015
3016 return (FALSE);
3017 }
3018
3019 /*
3020 * try_failover - internal version of nfs4_try_failover, called
3021 * only by rfscall and aclcall. Determine if failover is warranted
3022 * based on the clnt_stat and return the error number if it is.
3023 */
3024 static int
try_failover(enum clnt_stat rpc_status)3025 try_failover(enum clnt_stat rpc_status)
3026 {
3027 int err = 0;
3028
3029 if (rpc_status == RPC_SUCCESS)
3030 return (0);
3031
3032 #ifdef DEBUG
3033 if (rpc_status != 0 && nfs4_try_failover_any) {
3034 err = ETIMEDOUT;
3035 goto done;
3036 }
3037 #endif
3038 /*
3039 * The rpc status is used as an index into the table.
3040 * If the rpc status is outside of the range of the
3041 * table or if the rpc error numbers have been changed
3042 * since the table was constructed, then print a warning
3043 * (DEBUG only) and try failover anyway. Otherwise, just
3044 * grab the resulting error number out of the table.
3045 */
3046 if (rpc_status < RPC_SUCCESS || rpc_status >=
3047 sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
3048 try_failover_table[rpc_status].cstat != rpc_status) {
3049
3050 err = ETIMEDOUT;
3051 #ifdef DEBUG
3052 cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
3053 rpc_status);
3054 #endif
3055 } else
3056 err = try_failover_table[rpc_status].error;
3057
3058 done:
3059 if (rpc_status)
3060 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
3061 "nfs4_try_failover: %strying failover on error %d",
3062 err ? "" : "NOT ", rpc_status));
3063
3064 return (err);
3065 }
3066
3067 void
nfs4_error_zinit(nfs4_error_t * ep)3068 nfs4_error_zinit(nfs4_error_t *ep)
3069 {
3070 ep->error = 0;
3071 ep->stat = NFS4_OK;
3072 ep->rpc_status = RPC_SUCCESS;
3073 }
3074
3075 void
nfs4_error_init(nfs4_error_t * ep,int error)3076 nfs4_error_init(nfs4_error_t *ep, int error)
3077 {
3078 ep->error = error;
3079 ep->stat = NFS4_OK;
3080 ep->rpc_status = RPC_SUCCESS;
3081 }
3082
3083
3084 #ifdef DEBUG
3085
/*
 * Return a 16-bit hash for filehandle, stateid, clientid, owner.
 *
 * The first (len & ~3) bytes are consumed as 32-bit words, each folded
 * into the key as (word >> 16) ^ word; the remaining 0-3 bytes that do
 * not make up a whole word are then XORed in one at a time.  The result
 * is the low 16 bits of the key.  For lengths that are a multiple of 4
 * this is the same word-folding scheme the original comment describes as
 * shared with NFS v3.
 *
 * p   - start of the data to hash.  NOTE(review): the words are read
 *	 through a uint32_t pointer, so p is assumed to be suitably
 *	 aligned -- confirm for all callers.
 * len - number of bytes at p.
 */
int
hash16(void *p, int len)
{
	int i, rem;
	uint32_t *wp;
	uint8_t *bp;
	uint32_t key = 0;

	/* split len into whole 4-byte words plus 0-3 trailing bytes */
	if ((rem = len & 3) != 0)
		len &= ~3;

	for (i = 0, wp = (uint32_t *)p; i < len; i += 4, wp++) {
		key ^= (*wp >> 16) ^ *wp;
	}

	/*
	 * Hash the left-over bytes.  wp now points just past the last
	 * whole word, which is exactly where the left-over bytes start.
	 */
	for (i = 0, bp = (uint8_t *)wp; i < rem; i++)
		key ^= bp[i];

	return ((int)(key & 0xffff));
}
3112
3113 /*
3114 * rnode4info - return filehandle and path information for an rnode.
3115 * XXX MT issues: uses a single static buffer, no locking of path.
3116 */
3117 char *
rnode4info(rnode4_t * rp)3118 rnode4info(rnode4_t *rp)
3119 {
3120 static char buf[80];
3121 nfs4_fhandle_t fhandle;
3122 char *path;
3123 char *type;
3124
3125 if (rp == NULL)
3126 return ("null");
3127 if (rp->r_flags & R4ISXATTR)
3128 type = "attr";
3129 else if (RTOV4(rp)->v_flag & V_XATTRDIR)
3130 type = "attrdir";
3131 else if (RTOV4(rp)->v_flag & VROOT)
3132 type = "root";
3133 else if (RTOV4(rp)->v_type == VDIR)
3134 type = "dir";
3135 else if (RTOV4(rp)->v_type == VREG)
3136 type = "file";
3137 else
3138 type = "other";
3139 sfh4_copyval(rp->r_fh, &fhandle);
3140 path = fn_path(rp->r_svnode.sv_name);
3141 (void) snprintf(buf, 80, "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
3142 (void *)rp, path, type, rp->r_flags,
3143 hash16((void *)&fhandle.fh_buf, fhandle.fh_len));
3144 kmem_free(path, strlen(path)+1);
3145 return (buf);
3146 }
3147 #endif
3148