/*
 * Copyright (c) 2003, 2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Hiten Pandya <hmp@backplane.com>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
 * $DragonFly: src/sys/vm/vm_contig.c,v 1.21 2006/12/28 21:24:02 dillon Exp $
 */

/*
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * Contiguous memory allocation API.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

static void vm_contig_pg_free(int start, u_long size);

/*
 * vm_contig_pg_clean:
 *
 * Do a thorough cleanup of the specified 'queue', which can be either
 * PQ_ACTIVE or PQ_INACTIVE, by doing a walkthrough.  If a page is not
 * marked dirty it is shoved into the page cache, provided no one has
 * currently acquired it; otherwise localized action per object type
 * is taken for cleanup:
 *
 *	In the OBJT_VNODE case, the whole page range is cleaned up
 *	using the vm_object_page_clean() routine, by specifying a
 *	start and end of '0'.
 *
 *	Otherwise, if the object is of any other type, the generic
 *	pageout (daemon) flush routine is invoked.
 */
static void
vm_contig_pg_clean(int queue, int count)
{
	vm_object_t object;
	vm_page_t m, m_tmp;
	struct vm_page marker;
	struct vpgqueues *pq = &vm_page_queues[queue];

	/*
	 * Setup a local marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	marker.queue = queue;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(queue);
	TAILQ_INSERT_HEAD(&pq->pl, &marker, pageq);
	vm_page_queues_spin_unlock(queue);

	/*
	 * Iterate the queue.  Note that the vm_page spinlock must be
	 * acquired before the pageq spinlock so it's easiest to simply
	 * not hold it in the loop iteration.
	 */
	while (count-- > 0 && (m = TAILQ_NEXT(&marker, pageq)) != NULL) {
		vm_page_and_queue_spin_lock(m);
		if (m != TAILQ_NEXT(&marker, pageq)) {
			vm_page_and_queue_spin_unlock(m);
			++count;
			continue;
		}
		KKASSERT(m->queue == queue);

		TAILQ_REMOVE(&pq->pl, &marker, pageq);
		TAILQ_INSERT_AFTER(&pq->pl, m, &marker, pageq);

		if (m->flags & PG_MARKER) {
			vm_page_and_queue_spin_unlock(m);
			continue;
		}
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_and_queue_spin_unlock(m);
			continue;
		}
		vm_page_and_queue_spin_unlock(m);

		/*
		 * We've successfully busied the page
		 */
		if (m->queue - m->pc != queue) {
			vm_page_wakeup(m);
			continue;
		}
		if ((object = m->object) == NULL) {
			vm_page_wakeup(m);
			continue;
		}
		vm_page_test_dirty(m);
		if (m->dirty) {
			vm_object_hold(object);
			KKASSERT(m->object == object);

			if (object->type == OBJT_VNODE) {
				vm_page_wakeup(m);
				vn_lock(object->handle, LK_EXCLUSIVE|LK_RETRY);
				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
				vn_unlock(((struct vnode *)object->handle));
			} else if (object->type == OBJT_SWAP ||
				   object->type == OBJT_DEFAULT) {
				m_tmp = m;
				vm_pageout_flush(&m_tmp, 1, 0);
			} else {
				vm_page_wakeup(m);
			}
			vm_object_drop(object);
		} else if (m->hold_count == 0) {
			vm_page_cache(m);
		} else {
			vm_page_wakeup(m);
		}
	}

	/*
	 * Scrap our local marker
	 */
	vm_page_queues_spin_lock(queue);
	TAILQ_REMOVE(&pq->pl, &marker, pageq);
	vm_page_queues_spin_unlock(queue);
}

/*
 * vm_contig_pg_alloc:
 *
 * Allocate contiguous pages from the VM.  This function does not
 * map the allocated pages into the kernel map, otherwise it is
 * impossible to make large allocations (i.e. >2G).
 *
 * Malloc()'s data structures have been used for collection of
 * statistics and for allocations of less than a page.
 */
static int
vm_contig_pg_alloc(unsigned long size, vm_paddr_t low, vm_paddr_t high,
		   unsigned long alignment, unsigned long boundary, int mflags)
{
	int i, q, start, pass;
	vm_offset_t phys;
	vm_page_t pga = vm_page_array;
	vm_page_t m;
	int pqtype;

	size = round_page(size);
	if (size == 0)
		panic("vm_contig_pg_alloc: size must not be 0");
	if ((alignment & (alignment - 1)) != 0)
		panic("vm_contig_pg_alloc: alignment must be a power of 2");
	if ((boundary & (boundary - 1)) != 0)
		panic("vm_contig_pg_alloc: boundary must be a power of 2");

	start = 0;

	/*
	 * Three passes (0, 1, 2).  Each pass scans the VM page list for
	 * free or cached pages.  After each pass, if the entire scan failed,
	 * we attempt to flush inactive pages and reset the start index back
	 * to 0.  For passes 1 and 2 we also attempt to flush active pages.
	 */
	for (pass = 0; pass < 3; pass++) {
		/*
		 * Find the first page in the array that is free, within range,
		 * aligned, and such that the boundary won't be crossed.
		 */
again:
		for (i = start; i < vmstats.v_page_count; i++) {
			m = &pga[i];
			phys = VM_PAGE_TO_PHYS(m);
			pqtype = m->queue - m->pc;
			if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
			    (phys >= low) && (phys < high) &&
			    ((phys & (alignment - 1)) == 0) &&
			    (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0) &&
			    m->busy == 0 && m->wire_count == 0 &&
			    m->hold_count == 0 && (m->flags & PG_BUSY) == 0
			) {
				break;
			}
		}

		/*
		 * If we cannot find a page in the given range, or we have
		 * crossed the boundary, call vm_contig_pg_clean() to flush
		 * out the queues and return them to a normal state.
		 */
		if ((i == vmstats.v_page_count) ||
		    ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {

			/*
			 * Best effort flush of all inactive pages.
			 * This is quite quick, so for now stall all
			 * callers, even if they've specified M_NOWAIT.
			 */
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				vm_contig_pg_clean(PQ_INACTIVE + q,
						   vmstats.v_inactive_count);
				lwkt_yield();
			}

			/*
			 * Best effort flush of active pages.
			 *
			 * This is very, very slow.
			 * Only do this if the caller has agreed to M_WAITOK.
			 *
			 * If enough pages are flushed, we may succeed on
			 * the next (final) pass; if not, the caller,
			 * contigmalloc(), will fail in the index < 0 case.
			 */
			if (pass > 0 && (mflags & M_WAITOK)) {
				for (q = 0; q < PQ_L2_SIZE; ++q) {
					vm_contig_pg_clean(PQ_ACTIVE + q,
							   vmstats.v_active_count);
				}
				lwkt_yield();
			}

			/*
			 * We're already too high in the address space
			 * to succeed, reset to 0 for the next iteration.
			 */
			start = 0;
			continue;	/* next pass */
		}
		start = i;

		/*
		 * Check that successive pages are contiguous and free.
		 *
		 * (still in critical section)
		 */
		for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
			m = &pga[i];
			pqtype = m->queue - m->pc;
			if ((VM_PAGE_TO_PHYS(&m[0]) !=
			     (VM_PAGE_TO_PHYS(&m[-1]) + PAGE_SIZE)) ||
			    ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE)) ||
			    m->busy || m->wire_count ||
			    m->hold_count || (m->flags & PG_BUSY)
			) {
				start++;
				goto again;
			}
		}

		/*
		 * Try to allocate the pages.
		 *
		 * (still in critical section)
		 */
		for (i = start; i < (start + size / PAGE_SIZE); i++) {
			m = &pga[i];

			if (vm_page_busy_try(m, TRUE)) {
				vm_contig_pg_free(start,
						  (i - start) * PAGE_SIZE);
				start++;
				goto again;
			}
			pqtype = m->queue - m->pc;
			if (pqtype == PQ_CACHE) {
				vm_page_free(m);
				--i;
				continue;	/* retry the page */
			}
			if (pqtype != PQ_FREE) {
				vm_page_wakeup(m);
				vm_contig_pg_free(start,
						  (i - start) * PAGE_SIZE);
				start++;
				goto again;
			}
			KKASSERT(m->object == NULL);
			vm_page_unqueue_nowakeup(m);
			m->valid = VM_PAGE_BITS_ALL;
			if (m->flags & PG_ZERO)
				vm_page_zero_count--;
			KASSERT(m->dirty == 0,
				("vm_contig_pg_alloc: page %p was dirty", m));
			KKASSERT(m->wire_count == 0);
			KKASSERT(m->busy == 0);

			/*
			 * Clear all flags except PG_BUSY, PG_ZERO, and
			 * PG_WANTED, then unbusy the now allocated page.
			 */
			vm_page_flag_clear(m, ~(PG_BUSY|PG_ZERO|PG_WANTED));
			vm_page_wakeup(m);
		}

		/*
		 * Our job is done, return the starting index into
		 * vm_page_array.
		 */
		return (start);	/* aka &pga[start] */
	}

	/*
	 * Failed.
	 */
	return (-1);
}

/*
 * vm_contig_pg_free:
 *
 * Remove pages previously allocated by vm_contig_pg_alloc().  This
 * assumes all references to the pages have been removed and that it
 * is OK to add them back to the free list.
 *
 * Caller must ensure no races on the page range in question.
 * No other requirements.
 */
static void
vm_contig_pg_free(int start, u_long size)
{
	vm_page_t pga = vm_page_array;
	vm_page_t m;
	int i;

	size = round_page(size);
	if (size == 0)
		panic("vm_contig_pg_free: size must not be 0");

	for (i = start; i < (start + size / PAGE_SIZE); i++) {
		m = &pga[i];
		vm_page_busy_wait(m, FALSE, "cpgfr");
		vm_page_free(m);
	}
}

/*
 * vm_contig_pg_kmap:
 *
 * Map a previously allocated (vm_contig_pg_alloc) range of pages from
 * vm_page_array[] into the KVA.  Once mapped, the pages are part of
 * the kernel and are to be freed with kmem_free(&kernel_map, addr, size).
 *
 * No requirements.
 */
static vm_offset_t
vm_contig_pg_kmap(int start, u_long size, vm_map_t map, int flags)
{
	vm_offset_t addr, tmp_addr;
	vm_page_t pga = vm_page_array;
	int i, count;

	size = round_page(size);
	if (size == 0)
		panic("vm_contig_pg_kmap: size must not be 0");

	/*
	 * We've found a contiguous chunk that meets our requirements.
	 * Allocate KVM, assign phys pages, and return a kernel VM
	 * pointer.
	 */
	count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
	vm_map_lock(map);
	if (vm_map_findspace(map, vm_map_min(map), size, PAGE_SIZE, 0, &addr) !=
	    KERN_SUCCESS) {
		/*
		 * XXX We almost never run out of kernel virtual
		 * space, so we don't make the allocated memory
		 * above available.
		 */
		vm_map_unlock(map);
		vm_map_entry_release(count);
		return (0);
	}

	/*
	 * kernel_object maps 1:1 to kernel_map.
	 */
	vm_object_hold(&kernel_object);
	vm_object_reference_locked(&kernel_object);
	vm_map_insert(map, &count,
		      &kernel_object, addr,
		      addr, addr + size,
		      VM_MAPTYPE_NORMAL,
		      VM_PROT_ALL, VM_PROT_ALL,
		      0);
	vm_map_unlock(map);
	vm_map_entry_release(count);

	tmp_addr = addr;
	for (i = start; i < (start + size / PAGE_SIZE); i++) {
		vm_page_t m = &pga[i];
		if (vm_page_insert(m, &kernel_object, OFF_TO_IDX(tmp_addr)) ==
		    FALSE) {
			panic("vm_contig_pg_kmap: page already exists @%p",
			      (void *)(intptr_t)tmp_addr);
		}
		if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
			pmap_zero_page(VM_PAGE_TO_PHYS(m));
		m->flags = 0;
		tmp_addr += PAGE_SIZE;
	}
	vm_map_wire(map, addr, addr + size, 0);

	vm_object_drop(&kernel_object);

	return (addr);
}

/*
 * No requirements.
 */
void *
contigmalloc(
	unsigned long size,	/* should be size_t here and for malloc() */
	struct malloc_type *type,
	int flags,
	vm_paddr_t low,
	vm_paddr_t high,
	unsigned long alignment,
	unsigned long boundary)
{
	return contigmalloc_map(size, type, flags, low, high, alignment,
				boundary, &kernel_map);
}
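
/*
 * Hypothetical usage sketch (not part of this file's API): a driver needing
 * a physically contiguous, page-aligned DMA buffer below 4GB might call
 * contigmalloc() roughly as follows.  The wrapper name and the particular
 * low/high/alignment values are illustrative assumptions only.
 */
#if 0
static void *
example_alloc_dma_buffer(unsigned long bytes)
{
	/*
	 * M_WAITOK allows vm_contig_pg_alloc() to flush the active queues
	 * on its later passes; M_ZERO asks vm_contig_pg_kmap() to zero any
	 * page that is not already marked PG_ZERO.
	 */
	return contigmalloc(bytes, M_DEVBUF, M_WAITOK | M_ZERO,
			    0ul,		/* low physical address */
			    0xfffffffful,	/* high: stay below 4GB */
			    PAGE_SIZE,		/* alignment */
			    0ul);		/* no boundary restriction */
}
#endif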
/*
 * No requirements.
 */
void *
contigmalloc_map(
	unsigned long size,	/* should be size_t here and for malloc() */
	struct malloc_type *type,
	int flags,
	vm_paddr_t low,
	vm_paddr_t high,
	unsigned long alignment,
	unsigned long boundary,
	vm_map_t map)
{
	int index;
	void *rv;

	index = vm_contig_pg_alloc(size, low, high, alignment, boundary, flags);
	if (index < 0) {
		kprintf("contigmalloc_map: failed size %lu low=%llx "
			"high=%llx align=%lu boundary=%lu flags=%08x\n",
			size, (long long)low, (long long)high,
			alignment, boundary, flags);
		return NULL;
	}

	rv = (void *)vm_contig_pg_kmap(index, size, map, flags);
	if (rv == NULL)
		vm_contig_pg_free(index, size);

	return rv;
}

/*
 * No requirements.
 */
void
contigfree(void *addr, unsigned long size, struct malloc_type *type)
{
	kmem_free(&kernel_map, (vm_offset_t)addr, size);
}

/*
 * No requirements.
 */
vm_offset_t
vm_page_alloc_contig(
	vm_offset_t size,
	vm_paddr_t low,
	vm_paddr_t high,
	vm_offset_t alignment)
{
	return ((vm_offset_t)contigmalloc_map(size, M_DEVBUF, M_NOWAIT, low,
				high, alignment, 0ul, &kernel_map));
}
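
/*
 * Hypothetical release sketch matching the allocation example above:
 * memory obtained from contigmalloc() is returned with contigfree(),
 * which in this file simply tears down the kernel_map mapping via
 * kmem_free().  The wrapper name is an illustrative assumption.
 */
#if 0
static void
example_free_dma_buffer(void *buf, unsigned long bytes)
{
	if (buf != NULL)
		contigfree(buf, bytes, M_DEVBUF);
}
#endif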