/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/inline.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/vmsystm.h>
#include <sys/cpuvar.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vm.h>

#include <sys/swap.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/snode.h>
#include <sys/copyops.h>
#include <sys/conf.h>
#include <sys/sdt.h>

#include <vm/anon.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/page.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>

extern int maxphys;

void
minphys(struct buf *bp)
{
	if (bp->b_bcount > maxphys)
		bp->b_bcount = maxphys;
}
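
/*
 * Illustrative sketch (not part of this file): a driver whose hardware
 * cannot handle maxphys-sized transfers would typically pass its own
 * mincnt routine to physio(9F), clamping b_bcount to a device limit
 * before deferring to minphys().  The names xx_minphys and XX_MAXXFER
 * are hypothetical.
 *
 *	static void
 *	xx_minphys(struct buf *bp)
 *	{
 *		if (bp->b_bcount > XX_MAXXFER)
 *			bp->b_bcount = XX_MAXXFER;
 *		minphys(bp);
 *	}
 */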
/*
 * Use kmem_cache_create for physio buffers.  This has shown
 * a better cache distribution compared to buffers on the
 * stack.  It also avoids semaphore construction/destruction
 * per request.
 */

static struct kmem_cache *physio_buf_cache;

/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	bioinit((struct buf *)buf);
	return (0);
}

/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	biofini((struct buf *)buf);
}

void
physio_bufs_init(void)
{
	physio_buf_cache = kmem_cache_create("physio_buf_cache",
	    sizeof (struct buf), 0,
	    physio_buf_constructor, physio_buf_destructor,
	    NULL, NULL, NULL, 0);
}
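
/*
 * Illustrative sketch (not part of this file): a character driver's
 * read(9E) entry point normally arrives here via physio(9F), which
 * defers to default_physio().  The names xx_read and xx_strategy are
 * hypothetical.
 *
 *	static int
 *	xx_read(dev_t dev, struct uio *uiop, cred_t *credp)
 *	{
 *		return (physio(xx_strategy, NULL, dev, B_READ,
 *		    minphys, uiop));
 *	}
 *
 * Passing bp == NULL asks default_physio() to allocate a buf header
 * from physio_buf_cache for the duration of the request.
 */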
/*
 * initiate raw I/O request
 *
 * allocate buf header if necessary
 * adjust max size of each I/O request
 * lock down user pages and verify access protections
 * call driver's strategy routine to submit request
 * wait for I/O completion
 * unlock user pages and free allocated buf header
 */

int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
	int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
	struct iovec *iov;
	struct proc *procp;
	struct as *asp;
	ssize_t c;
	char *a;
	int error = 0;
	page_t **pplist;
	int allocbuf = 0;

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

	/* Kernel probe */
	TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
	    tnf_device, device, dev,
	    tnf_offset, offset, uio->uio_loffset,
	    tnf_size, size, uio->uio_resid,
	    tnf_bioflags, rw, rw);

	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);

	if (bp == NULL) {
		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
		bp->b_iodone = NULL;
		bp->b_resid = 0;
		allocbuf = 1;
	}
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		asp = procp->p_as;
	} else {
		procp = NULL;
		asp = &kas;
	}
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 * b_dip field.
	 */
	bp->b_file = NULL;
	bp->b_offset = -1;

	if (dev != NODEV) {
		(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
		    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	} else {
		bp->b_dip = NULL;
	}

	while (uio->uio_iovcnt > 0) {
		iov = uio->uio_iov;

		bp->b_error = 0;
		bp->b_proc = procp;

		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
				break;
			if (uio->uio_loffset < 0) {
				error = EINVAL;
				break;
			}
#ifdef	_ILP32
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
				error = EINVAL;
				break;
			}
#endif	/* _ILP32 */
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_edev = dev;
			bp->b_lblkno = btodt(uio->uio_loffset);

			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 * instead.
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			(*mincnt)(bp);
			c = bp->b_bcount;
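
			/*
			 * Note the access-type inversion below: a B_READ
			 * transfer has the device writing into the user's
			 * pages, so they are locked for S_WRITE access; a
			 * B_WRITE transfer locks them for S_READ.
			 */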
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
			    "as_pagelock_end:");

			if (error != 0) {
				bp->b_flags |= B_ERROR;
				bp->b_error = error;
				bp->b_flags &=
				    ~(B_BUSY|B_WANTED|B_PHYS);
				break;
			}
			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			}

			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;

			(void) (*strat)(bp);
			error = biowait(bp);

			/*
			 * unlock the pages
			 */
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");

			c -= bp->b_resid;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_resid -= c;
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
				break;
		}
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
			break;
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	if (allocbuf) {
		kmem_cache_free(physio_buf_cache, bp);
	}

	/* Kernel probe */
	TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
	    tnf_device, device, dev);

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

	return (error);
}

/*
 * Returns 0 on success, or an error on failure.
 *
 * This function is no longer a part of the DDI/DKI.
 * However, for compatibility, its interface should not
 * be changed and it should not be removed from the kernel.
 */
int
useracc(void *addr, size_t count, int access)
{
	uint_t prot;

	prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
	return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
}
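
/*
 * Illustrative sketch (not part of this file), following the contract
 * stated above (0 on success, an error on failure).  ioc_addr and
 * ioc_len are hypothetical names for a user buffer a driver is about
 * to write into:
 *
 *	int error;
 *
 *	error = useracc(ioc_addr, ioc_len, B_WRITE);
 *	if (error != 0)
 *		return (error);
 */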

#define	MAX_MAPIN_PAGES	8

/*
 * This function temporarily "borrows" user pages for kernel use. If
 * "cow" is on, it also sets up copy-on-write protection (only feasible
 * on MAP_PRIVATE segments) on the user mappings, to protect the borrowed
 * pages from any changes by the user. The caller is responsible for
 * unlocking and tearing down the cow settings when it's done with the
 * pages. For an example, see kcfree().
 *
 * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
 * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
 * kaddr != -1. On entering this function, cached_ppp contains a list
 * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
 * previous call). Thus if the same pages remain behind
 * [uaddr..uaddr+*lenp], the kernel map won't need to be reloaded again.
 *
 * For cow == 1, if the pages are anonymous pages, it also bumps the anon
 * reference count, and changes the user mapping to read-only. This
 * scheme should work on all types of segment drivers. But to be safe,
 * we check against segvn here.
 *
 * Since this function is used to emulate copyin() semantics, it checks
 * to make sure the user mappings allow "user-read".
 *
 * On exit "lenp" contains the number of bytes successfully locked and
 * mapped in. For the unsuccessful ones, the caller can fall back to
 * copyin().
 *
 * Error return:
 * ENOTSUP - operation like this is not supported either on this segment
 * type, or on this platform type.
 */
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
    struct anon **app, size_t *lenp, int cow)
{
	struct hat *hat;
	struct seg *seg;
	caddr_t base;
	page_t *pp, *ppp[MAX_MAPIN_PAGES];
	long i;
	int flags;
	size_t size, total = *lenp;
	char first = 1;
	faultcode_t res;

	*lenp = 0;
	if (cow) {
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		seg = as_findseg(as, uaddr, 0);
		if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
		    (uaddr + total) > base + seg->s_size) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (EINVAL);
		}
		/*
		 * The COW scheme should work for all segment types.
		 * But to be safe, we check against segvn.
		 */
		if (seg->s_ops != &segvn_ops) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		}
	}
	hat = as->a_hat;
	size = total;
tryagain:
	/*
	 * If (cow), hat_softlock will also change the usr protection to RO.
	 * This is the first step toward setting up cow. Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address. Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	size = total - size;
	*lenp += size;
	size = size >> PAGESHIFT;
	i = 0;
	while (i < size) {
		pp = ppp[i];
		if (cow) {
			kmutex_t *ahm;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE. This also works for MAP_SHARED
			 * segments. The disadvantage is that it locks the
			 * page from being used by anybody else.
			 */
			ahm = &anonhash_lock[
			    AH_LOCK(pp->p_vnode, pp->p_offset)];
			mutex_enter(ahm);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref. (segvn_unmap and
			 * segvn_free need the as writer lock to do anon_free.)
			 */
			if (*app != NULL) {
#if 0
				if ((*app)->an_refcnt == 0)
					/*
					 * Consider the following scenario
					 * (unlikely though):
					 * 1. an_refcnt == 2
					 * 2. we softlock the page.
					 * 3. cow occurs on this addr. So a new
					 *    ap, page and mapping is
					 *    established on addr.
					 * 4. an_refcnt drops to 1
					 *    (segvn_faultpage ->
					 *    anon_decref(oldap))
					 * 5. the last ref to ap also drops
					 *    (from another as). It ends up
					 *    blocked inside anon_decref trying
					 *    to get the page's excl lock.
					 * 6. Later kcfree unlocks the page,
					 *    calls anon_decref -> oops, ap is
					 *    gone already.
					 *
					 * Holding the as writer lock solves
					 * all these problems.
					 */
					*app = NULL;
				else
#endif
					(*app)->an_refcnt++;
			}
			mutex_exit(ahm);
		} else {
			*app = NULL;
		}
		if (kaddr != (caddr_t)-1) {
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					    HAT_LOAD_NOCONSIST;
				else
					flags = HAT_LOAD_REMAP |
					    HAT_LOAD_NOCONSIST;
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world. This
				 * is not very pretty. But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_NOCONSIST will get us an uncached
				 * mapping on VAC. hat_softlock will flush
				 * a VAC_WRITEBACK cache. Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is to change hat_devload
				 * to get a cached mapping. Allocate a kaddr
				 * with the same vcolor as uaddr. Then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
				*cached_ppp = pp;
			}
			kaddr += PAGESIZE;
		}
		cached_ppp++;
		app++;
		++i;
	}
	if (cow) {
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in. We could've fallen back to copy and
		 * let it fault in the pages. But for a mapped file, we
		 * normally reference each page only once. For zero-copy to
		 * be of any use, we'd better fault in the page now and try
		 * again.
		 */
		first = 0;
		size = size << PAGESHIFT;
		uaddr += size;
		total -= size;
		size = total;
		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		if (cow)
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		goto tryagain;
	}
	switch (res) {
	case FC_NOSUPPORT:
		return (ENOTSUP);
	case FC_PROT:	/* Pretend we don't know about it. This will be */
			/* caught by the caller when uiomove fails. */
	case FC_NOMAP:
	case FC_OBJERR:
	default:
		return (0);
	}
}
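
/*
 * Illustrative sketch (not part of this file) of the calling pattern
 * described in the block comment above cow_mapin(): consume the *lenp
 * bytes that were successfully locked and mapped, then fall back to
 * copyin() for the remainder.  Everything except cow_mapin() and
 * copyin() below is hypothetical.
 *
 *	size_t len = want;
 *	int error = cow_mapin(as, uaddr, kaddr, cached_ppp, app, &len, 1);
 *	if (error != 0)
 *		return (error);
 *	consume(kaddr, len);
 *	if (len < want &&
 *	    copyin(uaddr + len, fallback_buf, want - len) != 0)
 *		return (EFAULT);
 */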