/*
 * Copyright (c) 1982, 1986, 1989 Regents of the University of California.
 * All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)ffs_alloc.c	7.32 (Berkeley) 04/21/92
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/syslog.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

extern u_long nextgennumber;

static daddr_t	ffs_alloccg __P((struct inode *, int, daddr_t, int));
static daddr_t	ffs_alloccgblk __P((struct fs *, struct cg *, daddr_t));
static ino_t	ffs_dirpref __P((struct fs *));
static daddr_t	ffs_fragextend __P((struct inode *, int, long, int, int));
static void	ffs_fserr __P((struct fs *, u_int, char *));
static u_long	ffs_hashalloc
		    __P((struct inode *, int, long, int, u_long (*)()));
static ino_t	ffs_ialloccg __P((struct inode *, int, daddr_t, int));
static daddr_t	ffs_mapsearch __P((struct fs *, struct cg *, daddr_t, int));

/*
 * Allocate a block in the file system.
 *
 * The size of the requested block is given, which must be some
 * multiple of fs_fsize and <= fs_bsize.
 * A preference may be optionally specified. If a preference is given
 * the following hierarchy is used to allocate a block:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate a block in the same cylinder group.
 *   4) quadratically rehash into other cylinder groups, until an
 *      available block is located.
 * If no block preference is given the following hierarchy is used
 * to allocate a block:
 *   1) allocate a block in the cylinder group that contains the
 *      inode for the file.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available block is located.
 */
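/*
 * A typical caller (cf. ffs_balloc()) computes the preference with
 * ffs_blkpref() and receives the new disk address through bnp; an
 * illustrative sketch:
 *
 *	daddr_t newb;
 *
 *	error = ffs_alloc(ip, lbn,
 *	    ffs_blkpref(ip, lbn, (int)lbn, &ip->i_db[0]),
 *	    fs->fs_bsize, cred, &newb);
 */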
ffs_alloc(ip, lbn, bpref, size, cred, bnp)
	register struct inode *ip;
	daddr_t lbn, bpref;
	int size;
	struct ucred *cred;
	daddr_t *bnp;
{
	daddr_t bno;
	register struct fs *fs;
	register struct buf *bp;
	int cg, error;

	*bnp = 0;
	fs = ip->i_fs;
#ifdef DIAGNOSTIC
	if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
		panic("ffs_alloc: bad size");
	}
	if (cred == NOCRED)
		panic("ffs_alloc: missing credential\n");
#endif /* DIAGNOSTIC */
	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
		goto nospace;
	if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
		goto nospace;
#ifdef QUOTA
	if (error = chkdq(ip, (long)btodb(size), cred, 0))
		return (error);
#endif
	if (bpref >= fs->fs_size)
		bpref = 0;
	if (bpref == 0)
		cg = itog(fs, ip->i_number);
	else
		cg = dtog(fs, bpref);
	bno = (daddr_t)ffs_hashalloc(ip, cg, (long)bpref, size,
	    (u_long (*)())ffs_alloccg);
	if (bno > 0) {
		ip->i_blocks += btodb(size);
		ip->i_flag |= IUPD|ICHG;
		*bnp = bno;
		return (0);
	}
#ifdef QUOTA
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, (long)-btodb(size), cred, FORCE);
#endif
nospace:
	ffs_fserr(fs, cred->cr_uid, "file system full");
	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Reallocate a fragment to a bigger size
 *
 * The number and size of the old block are given, and a preference
 * and new size are also specified. The allocator attempts to extend
 * the original block. Failing that, the regular block allocator is
 * invoked to get an appropriate block.
 */
ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
	register struct inode *ip;
	daddr_t lbprev;
	daddr_t bpref;
	int osize, nsize;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct fs *fs;
	struct buf *bp, *obp;
	int cg, request, error;
	daddr_t bprev, bno;

	*bpp = 0;
	fs = ip->i_fs;
#ifdef DIAGNOSTIC
	if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
	    (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
		printf(
		    "dev = 0x%x, bsize = %d, osize = %d, nsize = %d, fs = %s\n",
		    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
		panic("ffs_realloccg: bad size");
	}
	if (cred == NOCRED)
		panic("ffs_realloccg: missing credential\n");
#endif /* DIAGNOSTIC */
	if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) <= 0)
		goto nospace;
	if ((bprev = ip->i_db[lbprev]) == 0) {
		printf("dev = 0x%x, bsize = %d, bprev = %d, fs = %s\n",
		    ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt);
		panic("ffs_realloccg: bad bprev");
	}
	/*
	 * Allocate the extra space in the buffer.
	 */
	if (error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) {
		brelse(bp);
		return (error);
	}
#ifdef QUOTA
	if (error = chkdq(ip, (long)btodb(nsize - osize), cred, 0)) {
		brelse(bp);
		return (error);
	}
#endif
	/*
	 * Check for extension in the existing location.
	 */
	cg = dtog(fs, bprev);
	if (bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize)) {
		if (bp->b_blkno != fsbtodb(fs, bno))
			panic("bad blockno");
		ip->i_blocks += btodb(nsize - osize);
		ip->i_flag |= IUPD|ICHG;
		allocbuf(bp, nsize);
		bp->b_flags |= B_DONE;
		bzero(bp->b_un.b_addr + osize, (unsigned)nsize - osize);
		*bpp = bp;
		return (0);
	}
	/*
	 * Allocate a new disk location.
	 */
	if (bpref >= fs->fs_size)
		bpref = 0;
	switch ((int)fs->fs_optim) {
	case FS_OPTSPACE:
		/*
		 * Allocate an exact sized fragment. Although this makes
		 * best use of space, we will waste time relocating it if
		 * the file continues to grow. If the fragmentation is
		 * less than half of the minimum free reserve, we choose
		 * to begin optimizing for time.
		 */
		request = nsize;
		if (fs->fs_minfree < 5 ||
		    fs->fs_cstotal.cs_nffree >
		    fs->fs_dsize * fs->fs_minfree / (2 * 100))
			break;
		log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
			fs->fs_fsmnt);
		fs->fs_optim = FS_OPTTIME;
		break;
	case FS_OPTTIME:
		/*
		 * At this point we have discovered a file that is trying to
		 * grow a small fragment to a larger fragment. To save time,
		 * we allocate a full sized block, then free the unused portion.
		 * If the file continues to grow, the `ffs_fragextend' call
		 * above will be able to grow it in place without further
		 * copying. If aberrant programs cause disk fragmentation to
		 * grow within 2% of the free reserve, we choose to begin
		 * optimizing for space.
		 */
		request = fs->fs_bsize;
		if (fs->fs_cstotal.cs_nffree <
		    fs->fs_dsize * (fs->fs_minfree - 2) / 100)
			break;
		log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
			fs->fs_fsmnt);
		fs->fs_optim = FS_OPTSPACE;
		break;
	default:
		printf("dev = 0x%x, optim = %d, fs = %s\n",
		    ip->i_dev, fs->fs_optim, fs->fs_fsmnt);
		panic("ffs_realloccg: bad optim");
		/* NOTREACHED */
	}
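	/*
	 * Illustrative thresholds (hypothetical numbers): with
	 * fs_dsize = 1000000 frags and fs_minfree = 10%, SPACE -> TIME
	 * fires when cs_nffree falls to 1000000 * 10 / 200 = 50000 frags,
	 * while TIME -> SPACE fires when it reaches
	 * 1000000 * (10 - 2) / 100 = 80000.  The gap between the two
	 * thresholds provides hysteresis, so the optimization mode does
	 * not flap on small changes in fragmentation.
	 */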
	bno = (daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request,
	    (u_long (*)())ffs_alloccg);
	if (bno > 0) {
		bp->b_blkno = fsbtodb(fs, bno);
		(void) vnode_pager_uncache(ITOV(ip));
		ffs_blkfree(ip, bprev, (long)osize);
		if (nsize < request)
			ffs_blkfree(ip, bno + numfrags(fs, nsize),
			    (long)(request - nsize));
		ip->i_blocks += btodb(nsize - osize);
		ip->i_flag |= IUPD|ICHG;
		allocbuf(bp, nsize);
		bp->b_flags |= B_DONE;
		bzero(bp->b_un.b_addr + osize, (unsigned)nsize - osize);
		*bpp = bp;
		return (0);
	}
#ifdef QUOTA
	/*
	 * Restore user's disk quota because allocation failed.
	 */
	(void) chkdq(ip, (long)-btodb(nsize - osize), cred, FORCE);
#endif
	brelse(bp);
nospace:
	/*
	 * no space available
	 */
	ffs_fserr(fs, cred->cr_uid, "file system full");
	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Allocate an inode in the file system.
 *
 * If allocating a directory, use ffs_dirpref to select the inode.
 * If allocating in a directory, the following hierarchy is followed:
 *   1) allocate the preferred inode.
 *   2) allocate an inode in the same cylinder group.
 *   3) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 * If no inode preference is given the following hierarchy is used
 * to allocate an inode:
 *   1) allocate an inode in cylinder group 0.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 */
ffs_valloc(pvp, mode, cred, vpp)
	register struct vnode *pvp;
	int mode;
	struct ucred *cred;
	struct vnode **vpp;
{
	register struct inode *pip;
	register struct fs *fs;
	register struct inode *ip;
	ino_t ino, ipref;
	int cg, error;

	*vpp = NULL;
	pip = VTOI(pvp);
	fs = pip->i_fs;
	if (fs->fs_cstotal.cs_nifree == 0)
		goto noinodes;

	if ((mode & IFMT) == IFDIR)
		ipref = ffs_dirpref(fs);
	else
		ipref = pip->i_number;
	if (ipref >= fs->fs_ncg * fs->fs_ipg)
		ipref = 0;
	cg = itog(fs, ipref);
	ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, ffs_ialloccg);
	if (ino == 0)
		goto noinodes;
	error = ffs_vget(pvp->v_mount, ino, vpp);
	if (error) {
		ffs_vfree(pvp, ino, mode);
		return (error);
	}
	ip = VTOI(*vpp);
	if (ip->i_mode) {
		printf("mode = 0%o, inum = %d, fs = %s\n",
		    ip->i_mode, ip->i_number, fs->fs_fsmnt);
		panic("ffs_valloc: dup alloc");
	}
	if (ip->i_blocks) {				/* XXX */
		printf("free inode %s/%d had %d blocks\n",
		    fs->fs_fsmnt, ino, ip->i_blocks);
		ip->i_blocks = 0;
	}
	ip->i_flags = 0;
	/*
	 * Set up a new generation number for this inode.
	 */
	if (++nextgennumber < (u_long)time.tv_sec)
		nextgennumber = time.tv_sec;
	ip->i_gen = nextgennumber;
	return (0);
noinodes:
	ffs_fserr(fs, cred->cr_uid, "out of inodes");
	uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
	return (ENOSPC);
}

/*
 * Find a cylinder group in which to place a directory.
 *
 * The policy implemented by this algorithm is to select, from
 * among those cylinder groups with above the average number of
 * free inodes, the one with the smallest number of directories.
 */
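/*
 * For example (hypothetical counts), with avgifree = 50 and three
 * groups whose (cs_nifree, cs_ndir) are (60, 12), (40, 3) and (55, 7),
 * group 1 is skipped for having too few free inodes and group 2 wins
 * with the fewer directories, so the first inode of group 2 is
 * returned as the preference.
 */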
static ino_t
ffs_dirpref(fs)
	register struct fs *fs;
{
	int cg, minndir, mincg, avgifree;

	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
	minndir = fs->fs_ipg;
	mincg = 0;
	for (cg = 0; cg < fs->fs_ncg; cg++)
		if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
		    fs->fs_cs(fs, cg).cs_nifree >= avgifree) {
			mincg = cg;
			minndir = fs->fs_cs(fs, cg).cs_ndir;
		}
	return ((ino_t)(fs->fs_ipg * mincg));
}

/*
 * Select the desired position for the next block in a file.  The file is
 * logically divided into sections. The first section is composed of the
 * direct blocks. Each additional section contains fs_maxbpg blocks.
 *
 * If no blocks have been allocated in the first section, the policy is to
 * request a block in the same cylinder group as the inode that describes
 * the file. If no blocks have been allocated in any other section, the
 * policy is to place the section in a cylinder group with a greater than
 * average number of free blocks.  An appropriate cylinder group is found
 * by using a rotor that sweeps the cylinder groups. When a new group of
 * blocks is needed, the sweep begins in the cylinder group following the
 * cylinder group from which the previous allocation was made. The sweep
 * continues until a cylinder group with greater than the average number
 * of free blocks is found. If the allocation is for the first block in an
 * indirect block, the information on the previous allocation is unavailable;
 * here a best guess is made based upon the logical block number being
 * allocated.
 *
 * If a section is already partially allocated, the policy is to
 * contiguously allocate fs_maxcontig blocks.  The end of one of these
 * contiguous blocks and the beginning of the next is physically separated
 * so that the disk head will be in transit between them for at least
 * fs_rotdelay milliseconds.  This is to allow time for the processor to
 * schedule another I/O transfer.
 */
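/*
 * Illustrative layout (hypothetical parameters): with NDADDR = 12 and
 * fs_maxbpg = 256, logical blocks 0-11 form the first section and are
 * preferentially placed in the cylinder group holding the inode; each
 * subsequent run of 256 entries within an indirect block starts a new
 * section, and the rotor sweep below picks a cylinder group with an
 * above-average count of free blocks for it.
 */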
daddr_t
ffs_blkpref(ip, lbn, indx, bap)
	struct inode *ip;
	daddr_t lbn;
	int indx;
	daddr_t *bap;
{
	register struct fs *fs;
	register int cg;
	int avgbfree, startcg;
	daddr_t nextblk;

	fs = ip->i_fs;
	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
		if (lbn < NDADDR) {
			cg = itog(fs, ip->i_number);
			return (fs->fs_fpg * cg + fs->fs_frag);
		}
		/*
		 * Find a cylinder group with a greater than average
		 * number of unused data blocks.
		 */
		if (indx == 0 || bap[indx - 1] == 0)
			startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
		else
			startcg = dtog(fs, bap[indx - 1]) + 1;
		startcg %= fs->fs_ncg;
		avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
		for (cg = startcg; cg < fs->fs_ncg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (fs->fs_fpg * cg + fs->fs_frag);
			}
		for (cg = 0; cg <= startcg; cg++)
			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
				fs->fs_cgrotor = cg;
				return (fs->fs_fpg * cg + fs->fs_frag);
			}
		return (NULL);
	}
	/*
	 * One or more previous blocks have been laid out. If less
	 * than fs_maxcontig previous blocks are contiguous, the
	 * next block is requested contiguously, otherwise it is
	 * requested rotationally delayed by fs_rotdelay milliseconds.
	 */
	nextblk = bap[indx - 1] + fs->fs_frag;
	if (indx > fs->fs_maxcontig &&
	    bap[indx - fs->fs_maxcontig] + blkstofrags(fs, fs->fs_maxcontig)
	    != nextblk)
		return (nextblk);
	if (fs->fs_rotdelay != 0)
		/*
		 * Here we convert ms of delay to frags as:
		 * (frags) = (ms) * (rev/sec) * (sect/rev) /
		 *	((sect/frag) * (ms/sec))
		 * then round up to the next block.
		 */
		nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
		    (NSPF(fs) * 1000), fs->fs_frag);
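	/*
	 * Worked example (hypothetical geometry): fs_rotdelay = 4 ms,
	 * fs_rps = 60 rev/sec, fs_nsect = 32 sect/rev and NSPF(fs) = 2
	 * sect/frag give 4 * 60 * 32 / (2 * 1000) = 3 frags (integer
	 * division); with fs_frag = 8 that rounds up to 8, pushing the
	 * preference one full block past the contiguous successor.
	 */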
	return (nextblk);
}

/*
 * Implement the cylinder overflow algorithm.
 *
 * The policy implemented by this algorithm is:
 *   1) allocate the block in its requested cylinder group.
 *   2) quadratically rehash on the cylinder group number.
 *   3) brute force search for a free block.
 */
/*VARARGS5*/
static u_long
ffs_hashalloc(ip, cg, pref, size, allocator)
	struct inode *ip;
	int cg;
	long pref;
	int size;	/* size for data blocks, mode for inodes */
	u_long (*allocator)();
{
	register struct fs *fs;
	long result;
	int i, icg = cg;

	fs = ip->i_fs;
	/*
	 * 1: preferred cylinder group
	 */
	result = (*allocator)(ip, cg, pref, size);
	if (result)
		return (result);
	/*
	 * 2: quadratic rehash
	 */
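	/*
	 * For example, starting from preferred group icg, the loop below
	 * probes icg + 1, icg + 3, icg + 7, icg + 15, ... (mod fs_ncg),
	 * since the increment i doubles after each step.
	 */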
	for (i = 1; i < fs->fs_ncg; i *= 2) {
		cg += i;
		if (cg >= fs->fs_ncg)
			cg -= fs->fs_ncg;
		result = (*allocator)(ip, cg, 0, size);
		if (result)
			return (result);
	}
	/*
	 * 3: brute force search
	 * Note that we start at i == 2, since 0 was checked initially,
	 * and 1 is always checked in the quadratic rehash.
	 */
	cg = (icg + 2) % fs->fs_ncg;
	for (i = 2; i < fs->fs_ncg; i++) {
		result = (*allocator)(ip, cg, 0, size);
		if (result)
			return (result);
		cg++;
		if (cg == fs->fs_ncg)
			cg = 0;
	}
	return (NULL);
}

/*
 * Determine whether a fragment can be extended.
 *
 * Check to see if the necessary fragments are available, and
 * if they are, allocate them.
 */
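/*
 * For example, on an 8192/1024 file system, growing a 1024-byte
 * fragment to 3072 bytes can succeed only if the two fragments
 * immediately following bprev are free and all three lie within the
 * same block.
 */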
static daddr_t
ffs_fragextend(ip, cg, bprev, osize, nsize)
	struct inode *ip;
	int cg;
	long bprev;
	int osize, nsize;
{
	register struct fs *fs;
	register struct cg *cgp;
	struct buf *bp;
	long bno;
	int frags, bbase;
	int i, error;

	fs = ip->i_fs;
	if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
		return (NULL);
	frags = numfrags(fs, nsize);
	bbase = fragnum(fs, bprev);
	if (bbase > fragnum(fs, (bprev + frags - 1))) {
		/* cannot extend across a block boundary */
		return (NULL);
	}
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return (NULL);
	}
	cgp = bp->b_un.b_cg;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (NULL);
	}
	cgp->cg_time = time.tv_sec;
	bno = dtogd(fs, bprev);
	for (i = numfrags(fs, osize); i < frags; i++)
		if (isclr(cg_blksfree(cgp), bno + i)) {
			brelse(bp);
			return (NULL);
		}
	/*
	 * The current fragment can be extended:
	 * deduct the count on the fragment being extended into,
	 * increase the count on the remaining fragment (if any),
	 * and allocate the extended piece.
	 */
	for (i = frags; i < fs->fs_frag - bbase; i++)
		if (isclr(cg_blksfree(cgp), bno + i))
			break;
	cgp->cg_frsum[i - numfrags(fs, osize)]--;
	if (i != frags)
		cgp->cg_frsum[i - frags]++;
	for (i = numfrags(fs, osize); i < frags; i++) {
		clrbit(cg_blksfree(cgp), bno + i);
		cgp->cg_cs.cs_nffree--;
		fs->fs_cstotal.cs_nffree--;
		fs->fs_cs(fs, cg).cs_nffree--;
	}
	fs->fs_fmod = 1;
	bdwrite(bp);
	return (bprev);
}

/*
 * Determine whether a block can be allocated.
 *
 * Check to see if a block of the appropriate size is available,
 * and if it is, allocate it.
 */
static daddr_t
ffs_alloccg(ip, cg, bpref, size)
	struct inode *ip;
	int cg;
	daddr_t bpref;
	int size;
{
	register struct fs *fs;
	register struct cg *cgp;
	struct buf *bp;
	register int i;
	int error, bno, frags, allocsiz;

	fs = ip->i_fs;
	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
		return (NULL);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return (NULL);
	}
	cgp = bp->b_un.b_cg;
	if (!cg_chkmagic(cgp) ||
	    (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
		brelse(bp);
		return (NULL);
	}
	cgp->cg_time = time.tv_sec;
	if (size == fs->fs_bsize) {
		bno = ffs_alloccgblk(fs, cgp, bpref);
		bdwrite(bp);
		return (bno);
	}
	/*
	 * check to see if any fragments are already available
	 * allocsiz is the size which will be allocated, hacking
	 * it down to a smaller size if necessary
	 */
	frags = numfrags(fs, size);
	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
		if (cgp->cg_frsum[allocsiz] != 0)
			break;
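	/*
	 * For example, if frags == 2 and the only nonzero summary entry
	 * is cg_frsum[3], allocsiz settles on 3: a 3-frag run is located,
	 * two frags are taken from it, and the leftover frag is refiled
	 * under cg_frsum[1] below.
	 */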
	if (allocsiz == fs->fs_frag) {
		/*
		 * no fragments were available, so a block will be
		 * allocated, and hacked up
		 */
		if (cgp->cg_cs.cs_nbfree == 0) {
			brelse(bp);
			return (NULL);
		}
		bno = ffs_alloccgblk(fs, cgp, bpref);
		bpref = dtogd(fs, bno);
		for (i = frags; i < fs->fs_frag; i++)
			setbit(cg_blksfree(cgp), bpref + i);
		i = fs->fs_frag - frags;
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		fs->fs_fmod = 1;
		cgp->cg_frsum[i]++;
		bdwrite(bp);
		return (bno);
	}
	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
	if (bno < 0) {
		brelse(bp);
		return (NULL);
	}
	for (i = 0; i < frags; i++)
		clrbit(cg_blksfree(cgp), bno + i);
	cgp->cg_cs.cs_nffree -= frags;
	fs->fs_cstotal.cs_nffree -= frags;
	fs->fs_cs(fs, cg).cs_nffree -= frags;
	fs->fs_fmod = 1;
	cgp->cg_frsum[allocsiz]--;
	if (frags != allocsiz)
		cgp->cg_frsum[allocsiz - frags]++;
	bdwrite(bp);
	return (cg * fs->fs_fpg + bno);
}

/*
 * Allocate a block in a cylinder group.
 *
 * This algorithm implements the following policy:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate the next available block on the block rotor for the
 *      specified cylinder group.
 * Note that this routine only allocates fs_bsize blocks; these
 * blocks may be fragmented by the routine that allocates them.
 */
static daddr_t
ffs_alloccgblk(fs, cgp, bpref)
	register struct fs *fs;
	register struct cg *cgp;
	daddr_t bpref;
{
	daddr_t bno;
	int cylno, pos, delta;
	short *cylbp;
	register int i;

	if (bpref == 0) {
		bpref = cgp->cg_rotor;
		goto norot;
	}
	bpref = blknum(fs, bpref);
	bpref = dtogd(fs, bpref);
	/*
	 * if the requested block is available, use it
	 */
	if (ffs_isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bpref))) {
		bno = bpref;
		goto gotit;
	}
	/*
	 * check for a block available on the same cylinder
	 */
	cylno = cbtocylno(fs, bpref);
	if (cg_blktot(cgp)[cylno] == 0)
		goto norot;
	if (fs->fs_cpc == 0) {
		/*
		 * block layout info is not available, so just have
		 * to take any block in this cylinder.
		 */
		bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
		goto norot;
	}
	/*
	 * check the summary information to see if a block is
	 * available in the requested cylinder starting at the
	 * requested rotational position and proceeding around.
	 */
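	/*
	 * For example, with fs_nrpos = 8 and a preferred rotational
	 * position of 5, the scan below tries positions 5, 6 and 7
	 * first, then wraps to 0 through 4.
	 */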
	cylbp = cg_blks(fs, cgp, cylno);
	pos = cbtorpos(fs, bpref);
	for (i = pos; i < fs->fs_nrpos; i++)
		if (cylbp[i] > 0)
			break;
	if (i == fs->fs_nrpos)
		for (i = 0; i < pos; i++)
			if (cylbp[i] > 0)
				break;
	if (cylbp[i] > 0) {
		/*
		 * Found a rotational position; now find the actual
		 * block.  Panic if none is actually there.
		 */
		pos = cylno % fs->fs_cpc;
		bno = (cylno - pos) * fs->fs_spc / NSPB(fs);
		if (fs_postbl(fs, pos)[i] == -1) {
			printf("pos = %d, i = %d, fs = %s\n",
			    pos, i, fs->fs_fsmnt);
			panic("ffs_alloccgblk: cyl groups corrupted");
		}
		for (i = fs_postbl(fs, pos)[i];; ) {
			if (ffs_isblock(fs, cg_blksfree(cgp), bno + i)) {
				bno = blkstofrags(fs, (bno + i));
				goto gotit;
			}
			delta = fs_rotbl(fs)[i];
			if (delta <= 0 ||
			    delta + i > fragstoblks(fs, fs->fs_fpg))
				break;
			i += delta;
		}
		printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
		panic("ffs_alloccgblk: can't find blk in cyl");
	}
norot:
	/*
	 * no blocks in the requested cylinder, so take next
	 * available one in this cylinder group.
	 */
	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
	if (bno < 0)
		return (NULL);
	cgp->cg_rotor = bno;
gotit:
	ffs_clrblock(fs, cg_blksfree(cgp), (long)fragstoblks(fs, bno));
	cgp->cg_cs.cs_nbfree--;
	fs->fs_cstotal.cs_nbfree--;
	fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
	cylno = cbtocylno(fs, bno);
	cg_blks(fs, cgp, cylno)[cbtorpos(fs, bno)]--;
	cg_blktot(cgp)[cylno]--;
	fs->fs_fmod = 1;
	return (cgp->cg_cgx * fs->fs_fpg + bno);
}

/*
 * Determine whether an inode can be allocated.
 *
 * Check to see if an inode is available, and if it is,
 * allocate it using the following policy:
 *   1) allocate the requested inode.
 *   2) allocate the next available inode after the requested
 *      inode in the specified cylinder group.
 */
static ino_t
ffs_ialloccg(ip, cg, ipref, mode)
	struct inode *ip;
	int cg;
	daddr_t ipref;
	int mode;
{
	register struct fs *fs;
	register struct cg *cgp;
	struct buf *bp;
	int error, start, len, loc, map, i;

	fs = ip->i_fs;
	if (fs->fs_cs(fs, cg).cs_nifree == 0)
		return (NULL);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return (NULL);
	}
	cgp = bp->b_un.b_cg;
	if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) {
		brelse(bp);
		return (NULL);
	}
	cgp->cg_time = time.tv_sec;
	if (ipref) {
		ipref %= fs->fs_ipg;
		if (isclr(cg_inosused(cgp), ipref))
			goto gotit;
	}
	start = cgp->cg_irotor / NBBY;
	len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
	loc = skpc(0xff, len, &cg_inosused(cgp)[start]);
	if (loc == 0) {
		len = start + 1;
		start = 0;
		loc = skpc(0xff, len, &cg_inosused(cgp)[0]);
		if (loc == 0) {
			printf("cg = %d, irotor = %d, fs = %s\n",
			    cg, cgp->cg_irotor, fs->fs_fsmnt);
			panic("ffs_ialloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	i = start + len - loc;
	map = cg_inosused(cgp)[i];
	ipref = i * NBBY;
	for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
		if ((map & i) == 0) {
			cgp->cg_irotor = ipref;
			goto gotit;
		}
	}
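	/*
	 * For example, if the byte fetched from the map is 0xef (bit 4
	 * clear), the loop above exits with ipref advanced by 4, naming
	 * the fifth inode of that byte as the free one.
	 */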
	printf("fs = %s\n", fs->fs_fsmnt);
	panic("ffs_ialloccg: block not in map");
	/* NOTREACHED */
gotit:
	setbit(cg_inosused(cgp), ipref);
	cgp->cg_cs.cs_nifree--;
	fs->fs_cstotal.cs_nifree--;
	fs->fs_cs(fs, cg).cs_nifree--;
	fs->fs_fmod = 1;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir++;
		fs->fs_cstotal.cs_ndir++;
		fs->fs_cs(fs, cg).cs_ndir++;
	}
	bdwrite(bp);
	return (cg * fs->fs_ipg + ipref);
}

/*
 * Free a block or fragment.
 *
 * The specified block or fragment is placed back in the
 * free map.  When a fragment is deallocated, a check is made
 * to see whether a full block can be reassembled.
 */
ffs_blkfree(ip, bno, size)
	register struct inode *ip;
	daddr_t bno;
	long size;
{
	register struct fs *fs;
	register struct cg *cgp;
	struct buf *bp;
	int error, cg, blk, frags, bbase;
	register int i;

	fs = ip->i_fs;
	if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
		panic("blkfree: bad size");
	}
	cg = dtog(fs, bno);
	if ((unsigned)bno >= fs->fs_size) {
		printf("bad block %d, ino %d\n", bno, ip->i_number);
		ffs_fserr(fs, ip->i_uid, "bad block");
		return;
	}
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return;
	}
	cgp = bp->b_un.b_cg;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return;
	}
	cgp->cg_time = time.tv_sec;
	bno = dtogd(fs, bno);
	if (size == fs->fs_bsize) {
		if (ffs_isblock(fs, cg_blksfree(cgp), fragstoblks(fs, bno))) {
			printf("dev = 0x%x, block = %d, fs = %s\n",
			    ip->i_dev, bno, fs->fs_fsmnt);
			panic("blkfree: freeing free block");
		}
		ffs_setblock(fs, cg_blksfree(cgp), fragstoblks(fs, bno));
		cgp->cg_cs.cs_nbfree++;
		fs->fs_cstotal.cs_nbfree++;
		fs->fs_cs(fs, cg).cs_nbfree++;
		i = cbtocylno(fs, bno);
		cg_blks(fs, cgp, i)[cbtorpos(fs, bno)]++;
		cg_blktot(cgp)[i]++;
	} else {
		bbase = bno - fragnum(fs, bno);
		/*
		 * decrement the counts associated with the old frags
		 */
		blk = blkmap(fs, cg_blksfree(cgp), bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
		/*
		 * deallocate the fragment
		 */
		frags = numfrags(fs, size);
		for (i = 0; i < frags; i++) {
			if (isset(cg_blksfree(cgp), bno + i)) {
				printf("dev = 0x%x, block = %d, fs = %s\n",
				    ip->i_dev, bno + i, fs->fs_fsmnt);
				panic("blkfree: freeing free frag");
			}
			setbit(cg_blksfree(cgp), bno + i);
		}
		cgp->cg_cs.cs_nffree += i;
		fs->fs_cstotal.cs_nffree += i;
		fs->fs_cs(fs, cg).cs_nffree += i;
		/*
		 * add back in counts associated with the new frags
		 */
		blk = blkmap(fs, cg_blksfree(cgp), bbase);
		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
		/*
		 * if a complete block has been reassembled, account for it
		 */
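		/*
		 * For example, on an 8-frag-per-block file system,
		 * freeing the last missing fragment of a block turns
		 * eight free fragments into one free block: nffree
		 * drops by fs_frag while nbfree rises by one.
		 */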
		if (ffs_isblock(fs, cg_blksfree(cgp),
		    (daddr_t)fragstoblks(fs, bbase))) {
			cgp->cg_cs.cs_nffree -= fs->fs_frag;
			fs->fs_cstotal.cs_nffree -= fs->fs_frag;
			fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
			cgp->cg_cs.cs_nbfree++;
			fs->fs_cstotal.cs_nbfree++;
			fs->fs_cs(fs, cg).cs_nbfree++;
			i = cbtocylno(fs, bbase);
			cg_blks(fs, cgp, i)[cbtorpos(fs, bbase)]++;
			cg_blktot(cgp)[i]++;
		}
	}
	fs->fs_fmod = 1;
	bdwrite(bp);
}

/*
 * Free an inode.
 *
 * The specified inode is placed back in the free map.
 */
void
ffs_vfree(pvp, ino, mode)
	struct vnode *pvp;
	ino_t ino;
	int mode;
{
	register struct fs *fs;
	register struct cg *cgp;
	register struct inode *pip;
	struct buf *bp;
	int error, cg;

	pip = VTOI(pvp);
	fs = pip->i_fs;
	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
		panic("ifree: range: dev = 0x%x, ino = %d, fs = %s\n",
		    pip->i_dev, ino, fs->fs_fsmnt);
	cg = itog(fs, ino);
	error = bread(pip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return;
	}
	cgp = bp->b_un.b_cg;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return;
	}
	cgp->cg_time = time.tv_sec;
	ino %= fs->fs_ipg;
	if (isclr(cg_inosused(cgp), ino)) {
		printf("dev = 0x%x, ino = %d, fs = %s\n",
		    pip->i_dev, ino, fs->fs_fsmnt);
		if (fs->fs_ronly == 0)
			panic("ifree: freeing free inode");
	}
	clrbit(cg_inosused(cgp), ino);
	if (ino < cgp->cg_irotor)
		cgp->cg_irotor = ino;
	cgp->cg_cs.cs_nifree++;
	fs->fs_cstotal.cs_nifree++;
	fs->fs_cs(fs, cg).cs_nifree++;
	if ((mode & IFMT) == IFDIR) {
		cgp->cg_cs.cs_ndir--;
		fs->fs_cstotal.cs_ndir--;
		fs->fs_cs(fs, cg).cs_ndir--;
	}
	fs->fs_fmod = 1;
	bdwrite(bp);
}

/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * It is a panic if a request is made to find a block when none are
 * available.
 */
static daddr_t
ffs_mapsearch(fs, cgp, bpref, allocsiz)
	register struct fs *fs;
	register struct cg *cgp;
	daddr_t bpref;
	int allocsiz;
{
	daddr_t bno;
	int start, len, loc, i;
	int blk, field, subfield, pos;

	/*
	 * find the fragment by searching through the free block
	 * map for an appropriate bit pattern
	 */
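	/*
	 * Roughly, scanc() compares each byte of the free map against
	 * fragtbl[fs_frag], which records for every possible byte value
	 * which free-run lengths occur in it; the mask argument selects
	 * the bit for runs of length allocsiz (shifted when a block
	 * spans less than a full byte), so the scan stops at the first
	 * byte containing a suitable run.
	 */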
	if (bpref)
		start = dtogd(fs, bpref) / NBBY;
	else
		start = cgp->cg_frotor / NBBY;
	len = howmany(fs->fs_fpg, NBBY) - start;
	loc = scanc((unsigned)len, (u_char *)&cg_blksfree(cgp)[start],
		(u_char *)fragtbl[fs->fs_frag],
		(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
	if (loc == 0) {
		len = start + 1;
		start = 0;
		loc = scanc((unsigned)len, (u_char *)&cg_blksfree(cgp)[0],
			(u_char *)fragtbl[fs->fs_frag],
			(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
		if (loc == 0) {
			printf("start = %d, len = %d, fs = %s\n",
			    start, len, fs->fs_fsmnt);
			panic("ffs_alloccg: map corrupted");
			/* NOTREACHED */
		}
	}
	bno = (start + len - loc) * NBBY;
	cgp->cg_frotor = bno;
	/*
	 * found the byte in the map
	 * sift through the bits to find the selected frag
	 */
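	/*
	 * The sift relies on around[] and inside[] (see ffs_tables.c):
	 * around[allocsiz] masks a window of allocsiz + 2 bits, and
	 * inside[allocsiz] requires the middle allocsiz bits to be set
	 * (free) with a clear bit on each side; sliding both masks
	 * across the block locates a free run of exactly the requested
	 * size.
	 */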
	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
		blk = blkmap(fs, cg_blksfree(cgp), bno);
		blk <<= 1;
		field = around[allocsiz];
		subfield = inside[allocsiz];
		for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
			if ((blk & field) == subfield)
				return (bno + pos);
			field <<= 1;
			subfield <<= 1;
		}
	}
	printf("bno = %d, fs = %s\n", bno, fs->fs_fsmnt);
	panic("ffs_alloccg: block not in map");
	return (-1);
}

/*
 * Fserr prints the name of a file system with an error diagnostic.
 *
 * The form of the error message is:
 *	fs: error message
 */
static void
ffs_fserr(fs, uid, cp)
	struct fs *fs;
	u_int uid;
	char *cp;
{

	log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->fs_fsmnt, cp);
}