ffs_alloc.c source code [src/src/sys/ufs/ffs/ffs_alloc.c]

1	/* $NetBSD: ffs_alloc.c,v 1.154 2016/10/30 15:01:46 christos Exp $ */
2
3	/*-
4	* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5	* All rights reserved.
6	*
7	* This code is derived from software contributed to The NetBSD Foundation
8	* by Wasabi Systems, Inc.
9	*
10	* Redistribution and use in source and binary forms, with or without
11	* modification, are permitted provided that the following conditions
12	* are met:
13	* 1. Redistributions of source code must retain the above copyright
14	* notice, this list of conditions and the following disclaimer.
15	* 2. Redistributions in binary form must reproduce the above copyright
16	* notice, this list of conditions and the following disclaimer in the
17	* documentation and/or other materials provided with the distribution.
18	*
19	* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21	* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29	* POSSIBILITY OF SUCH DAMAGE.
30	*/
31
32	/*
33	* Copyright (c) 2002 Networks Associates Technology, Inc.
34	* All rights reserved.
35	*
36	* This software was developed for the FreeBSD Project by Marshall
37	* Kirk McKusick and Network Associates Laboratories, the Security
38	* Research Division of Network Associates, Inc. under DARPA/SPAWAR
39	* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
40	* research program
41	*
42	* Copyright (c) 1982, 1986, 1989, 1993
43	* The Regents of the University of California. All rights reserved.
44	*
45	* Redistribution and use in source and binary forms, with or without
46	* modification, are permitted provided that the following conditions
47	* are met:
48	* 1. Redistributions of source code must retain the above copyright
49	* notice, this list of conditions and the following disclaimer.
50	* 2. Redistributions in binary form must reproduce the above copyright
51	* notice, this list of conditions and the following disclaimer in the
52	* documentation and/or other materials provided with the distribution.
53	* 3. Neither the name of the University nor the names of its contributors
54	* may be used to endorse or promote products derived from this software
55	* without specific prior written permission.
56	*
57	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
58	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
59	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
60	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
61	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
62	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
63	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
64	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
65	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
66	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
67	* SUCH DAMAGE.
68	*
69	* @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95
70	*/
71
72	#include <sys/cdefs.h>
73	__KERNEL_RCSID(`0`, "$NetBSD: ffs_alloc.c,v 1.154 2016/10/30 15:01:46 christos Exp $");
74
75	#if defined(_KERNEL_OPT)
76	#include "opt_ffs.h"
77	#include "opt_quota.h"
78	#include "opt_uvm_page_trkown.h"
79	#endif
80
81	#include <sys/param.h>
82	#include <sys/systm.h>
83	#include <sys/buf.h>
84	#include <sys/cprng.h>
85	#include <sys/fstrans.h>
86	#include <sys/kauth.h>
87	#include <sys/kernel.h>
88	#include <sys/mount.h>
89	#include <sys/proc.h>
90	#include <sys/syslog.h>
91	#include <sys/vnode.h>
92	#include <sys/wapbl.h>
93	#include <sys/cprng.h>
94
95	#include <miscfs/specfs/specdev.h>
96	#include <ufs/ufs/quota.h>
97	#include <ufs/ufs/ufsmount.h>
98	#include <ufs/ufs/inode.h>
99	#include <ufs/ufs/ufs_extern.h>
100	#include <ufs/ufs/ufs_bswap.h>
101	#include <ufs/ufs/ufs_wapbl.h>
102
103	#include <ufs/ffs/fs.h>
104	#include <ufs/ffs/ffs_extern.h>
105
106	#ifdef UVM_PAGE_TRKOWN
107	#include <uvm/uvm.h>
108	#endif
109
/*
 * Forward declarations of the file-local (static) helpers.
 * NOTE(review): the pointer declarators on these lines were damaged when
 * the file was extracted (e.g. "struct inode ," for "struct inode *,");
 * restore them from the upstream source before compiling.
 */
110	static daddr_t ffs_alloccg(struct inode , int, daddr_t, int, int, int*);
111	static daddr_t ffs_alloccgblk(struct inode , struct* buf , daddr_t, int, int*);
112	static ino_t ffs_dirpref(struct inode *);
113	static daddr_t ffs_fragextend(struct inode , int, daddr_t, int, int*);
114	static void ffs_fserr(struct fs , kauth_cred_t, const* char *);
/* ffs_hashalloc() takes the per-cylinder-group allocator as a callback. */
115	static daddr_t ffs_hashalloc(struct inode , int, daddr_t, int, int, int*,
116	daddr_t ()(struct* inode , int, daddr_t, int, int, int*));
117	static daddr_t ffs_nodealloccg(struct inode , int, daddr_t, int, int, int*);
118	static int32_t ffs_mapsearch(struct fs , struct* cg *,
119	daddr_t, int);
120	static void ffs_blkfree_common(struct ufsmount , struct* fs , dev_t, struct* buf *,
121	daddr_t, long, bool);
122	static void ffs_freefile_common(struct ufsmount , struct* fs , dev_t, struct* buf *, ino_t,
123	int, bool);
124
125	/* if 1, changes in optimization strategy are logged */
/*
 * Off by default.  When non-zero, ffs_realloccg() emits a LOG_NOTICE
 * message each time it flips fs_optim between SPACE and TIME.
 */
126	int ffs_log_changeopt = `0`;
127
128	/ in ffs_tables.c /
/* Lookup tables declared here and defined elsewhere. */
129	extern const int inside[], around[];
130	extern const u_char * const fragtbl[];
131
132	/ Basic consistency check for block allocations /
133	static int
134	ffs_check_bad_allocation(const char func, struct* fs *fs, daddr_t bno,
135	long size, dev_t dev, ino_t inum)
136	{
137	if ((u_int)size > fs->fs_bsize \|\| ffs_fragoff(fs, size) != `0` \|\|
138	ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) {
139	panic("%s: bad size: dev = 0x%llx, bno = %" PRId64
140	" bsize = %d, size = %ld, fs = %s", func,
141	(long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
142	}
143
144	if (bno >= fs->fs_size) {
145	printf("%s: bad block %" PRId64 ", ino %llu\n", func, bno,
146	(unsigned long long)inum);
147	ffs_fserr(fs, NOCRED, "bad block");
148	return EINVAL;
149	}
150	return `0`;
151	}
152
153	/*
154	* Allocate a block in the file system.
155	*
156	* The size of the requested block is given, which must be some
157	* multiple of fs_fsize and <= fs_bsize.
158	* A preference may be optionally specified. If a preference is given
159	* the following hierarchy is used to allocate a block:
160	* 1) allocate the requested block.
161	* 2) allocate a rotationally optimal block in the same cylinder.
162	* 3) allocate a block in the same cylinder group.
163	* 4) quadratically rehash into other cylinder groups, until an
164	* available block is located.
165	* If no block preference is given the following hierarchy is used
166	* to allocate a block:
167	* 1) allocate a block in the cylinder group that contains the
168	* inode for the file.
169	* 2) quadratically rehash into other cylinder groups, until an
170	* available block is located.
171	*
172	* => called with um_lock held
173	* => releases um_lock before returning
174	*/
/*
 * Parameters and results:
 *   ip     inode the space is being allocated for
 *   lbn    logical block number within the file (used only by the
 *          UVM_PAGE_TRKOWN sanity check)
 *   bpref  preferred file system block, 0 for none; values at or beyond
 *          fs_size are treated as "no preference"
 *   size   number of bytes wanted
 *   flags  handed to ffs_hashalloc(); B_CONTIG also suppresses the
 *          "file system full" report on failure
 *   cred   credential used for the reserved-space check and quota charge
 *   bnp    cleared on entry, set to the allocated block on success
 *
 * Returns 0 on success, ENOSPC when no space is available, or the error
 * from chkdq() when quota support is compiled in.
 */
175	int
176	ffs_alloc(struct inode ip, daddr_t lbn, daddr_t bpref, int* size,
177	int flags, kauth_cred_t cred, daddr_t *bnp)
178	{
179	struct ufsmount *ump;
180	struct fs *fs;
181	daddr_t bno;
182	int cg;
183	#if defined(QUOTA) \|\| defined(QUOTA2)
184	int error;
185	#endif
186
187	fs = ip->i_fs;
188	ump = ip->i_ump;
189
190	KASSERT(mutex_owned(&ump->um_lock));
191
192	#ifdef UVM_PAGE_TRKOWN
193
194	/*
195	* Sanity-check that allocations within the file size
196	* do not allow other threads to read the stale contents
197	* of newly allocated blocks.
198	* Usually pages will exist to cover the new allocation.
199	* There is an optimization in ffs_write() where we skip
200	* creating pages if several conditions are met:
201	* - the file must not be mapped (in any user address space).
202	* - the write must cover whole pages and whole blocks.
203	* If those conditions are not met then pages must exist and
204	* be locked by the current thread.
205	*/
206
207	if (ITOV(ip)->v_type == VREG &&
208	ffs_lblktosize(fs, (voff_t)lbn) < round_page(ITOV(ip)->v_size)) {
209	struct vm_page *pg;
210	struct vnode *vp = ITOV(ip);
211	struct uvm_object *uobj = &vp->v_uobj;
212	voff_t off = trunc_page(ffs_lblktosize(fs, lbn));
213	voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size);
214
/* Walk every page overlapping the new block, under the object lock. */
215	mutex_enter(uobj->vmobjlock);
216	while (off < endoff) {
217	pg = uvm_pagelookup(uobj, off);
218	KASSERT((pg == NULL && (vp->v_vflag & VV_MAPPED) == `0` &&
219	(size & PAGE_MASK) == `0` &&
220	ffs_blkoff(fs, size) == `0`) \|\|
221	(pg != NULL && pg->owner == curproc->p_pid &&
222	pg->lowner == curlwp->l_lid));
223	off += PAGE_SIZE;
224	}
225	mutex_exit(uobj->vmobjlock);
226	}
227	#endif
228
229	*bnp = `0`;
230	#ifdef DIAGNOSTIC
231	if (cred == NOCRED)
232	panic("%s: missing credential", __func__);
233	if ((u_int)size > fs->fs_bsize \|\| ffs_fragoff(fs, size) != `0`) {
234	panic("%s: bad size: dev = 0x%llx, bsize = %d, size = %d, "
235	"fs = %s", __func__, (unsigned long long)ip->i_dev,
236	fs->fs_bsize, size, fs->fs_fsmnt);
237	}
238	#endif /* DIAGNOSTIC */
/* A full-block request cannot succeed when no whole block is free. */
239	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == `0`)
240	goto nospace;
/*
 * Once free space is down to the fs_minfree reserve, only callers that
 * pass the KAUTH_SYSTEM_FS_RESERVEDSPACE check may allocate.
 */
241	if (freespace(fs, fs->fs_minfree) <= `0` &&
242	kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, `0`, NULL,
243	NULL, NULL) != `0`)
244	goto nospace;
245	#if defined(QUOTA) \|\| defined(QUOTA2)
/*
 * The quota charge is made with um_lock dropped.  On a quota error we
 * return with the lock still released, which is what callers expect.
 */
246	mutex_exit(&ump->um_lock);
247	if ((error = chkdq(ip, btodb(size), cred, `0`)) != `0`)
248	return (error);
249	mutex_enter(&ump->um_lock);
250	#endif
251
/*
 * Choose the starting cylinder group: the one holding the preferred
 * block, or the inode's own group when there is no usable preference.
 */
252	if (bpref >= fs->fs_size)
253	bpref = `0`;
254	if (bpref == `0`)
255	cg = ino_to_cg(fs, ip->i_number);
256	else
257	cg = dtog(fs, bpref);
258	bno = ffs_hashalloc(ip, cg, bpref, size, `0`, flags, ffs_alloccg);
259	if (bno > `0`) {
/* Success: account the new blocks to the inode and mark it dirty. */
260	DIP_ADD(ip, blocks, btodb(size));
261	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
262	*bnp = bno;
263	return (`0`);
264	}
265	#if defined(QUOTA) \|\| defined(QUOTA2)
266	/*
267	* Restore user's disk quota because allocation failed.
268	*/
269	(void) chkdq(ip, -btodb(size), cred, FORCE);
270	#endif
271	if (flags & B_CONTIG) {
272	/*
273	* XXX ump->um_lock handling is "suspect" at best.
274	* For the case where ffs_hashalloc() fails early
275	* in the B_CONTIG case we reach here with um_lock
276	* already unlocked, so we can't release it again
277	* like in the normal error path. See kern/39206.
278	*
279	*
280	* Fail silently - it's up to our caller to report
281	* errors.
282	*/
283	return (ENOSPC);
284	}
/* Common failure exit: um_lock is held on every path that reaches here. */
285	nospace:
286	mutex_exit(&ump->um_lock);
287	ffs_fserr(fs, cred, "file system full");
288	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
289	return (ENOSPC);
290	}
291
292	/*
293	* Reallocate a fragment to a bigger size
294	*
295	* The number and size of the old block is given, and a preference
296	* and new size is also specified. The allocator attempts to extend
297	* the original block. Failing that, the regular block allocator is
298	* invoked to get an appropriate block.
299	*
300	* => called with um_lock held
301	* => return with um_lock released
302	*/
/*
 * Parameters and results:
 *   ip      inode owning the fragment
 *   lbprev  index of the fragment in the inode's direct block array
 *           (i_ffs1_db[] / i_ffs2_db[])
 *   bpref   preferred new location, 0 for none; values at or beyond
 *           fs_size are treated as "no preference"
 *   osize   current size of the fragment in bytes
 *   nsize   requested new size in bytes
 *   cred    credential used for the reserved-space check and quota charge
 *   bpp     optional; when non-NULL the old contents are read and the
 *           buffer is returned resized to nsize with the added tail zeroed
 *   blknop  optional; when non-NULL receives the resulting block number
 *
 * Returns 0 on success, ENOSPC when no space is available, or the error
 * from bread()/chkdq().
 */
303	int
304	ffs_realloccg(struct inode ip, daddr_t lbprev, daddr_t bpref, int* osize,
305	int nsize, kauth_cred_t cred, struct buf *bpp, daddr_t blknop)
306	{
307	struct ufsmount *ump;
308	struct fs *fs;
309	struct buf *bp;
310	int cg, request, error;
311	daddr_t bprev, bno;
312
313	fs = ip->i_fs;
314	ump = ip->i_ump;
315
316	KASSERT(mutex_owned(&ump->um_lock));
317
318	#ifdef UVM_PAGE_TRKOWN
319
320	/*
321	* Sanity-check that allocations within the file size
322	* do not allow other threads to read the stale contents
323	* of newly allocated blocks.
324	* Unlike in ffs_alloc(), here pages must always exist
325	* for such allocations, because only the last block of a file
326	* can be a fragment and ffs_write() will reallocate the
327	* fragment to the new size using ufs_balloc_range(),
328	* which always creates pages to cover blocks it allocates.
329	*/
330
331	if (ITOV(ip)->v_type == VREG) {
332	struct vm_page *pg;
333	struct uvm_object *uobj = &ITOV(ip)->v_uobj;
334	voff_t off = trunc_page(ffs_lblktosize(fs, lbprev));
335	voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize);
336
337	mutex_enter(uobj->vmobjlock);
338	while (off < endoff) {
339	pg = uvm_pagelookup(uobj, off);
/* No NULL test on pg: per the comment above, the page must exist here. */
340	KASSERT(pg->owner == curproc->p_pid &&
341	pg->lowner == curlwp->l_lid);
342	off += PAGE_SIZE;
343	}
344	mutex_exit(uobj->vmobjlock);
345	}
346	#endif
347
348	#ifdef DIAGNOSTIC
349	if (cred == NOCRED)
350	panic("%s: missing credential", __func__);
351	if ((u_int)osize > fs->fs_bsize \|\| ffs_fragoff(fs, osize) != `0` \|\|
352	(u_int)nsize > fs->fs_bsize \|\| ffs_fragoff(fs, nsize) != `0`) {
353	panic("%s: bad size: dev = 0x%llx, bsize = %d, osize = %d, "
354	"nsize = %d, fs = %s", __func__,
355	(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
356	fs->fs_fsmnt);
357	}
358	#endif /* DIAGNOSTIC */
/*
 * Reserved-space check.  The lock is dropped before jumping because the
 * nospace exit path does not release it.
 */
359	if (freespace(fs, fs->fs_minfree) <= `0` &&
360	kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, `0`, NULL,
361	NULL, NULL) != `0`) {
362	mutex_exit(&ump->um_lock);
363	goto nospace;
364	}
/* Fetch the fragment's current address, byte-swapped if the fs needs it. */
365	if (fs->fs_magic == FS_UFS2_MAGIC)
366	bprev = ufs_rw64(ip->i_ffs2_db[lbprev], UFS_FSNEEDSWAP(fs));
367	else
368	bprev = ufs_rw32(ip->i_ffs1_db[lbprev], UFS_FSNEEDSWAP(fs));
369
370	if (bprev == `0`) {
371	panic("%s: bad bprev: dev = 0x%llx, bsize = %d, bprev = %"
372	PRId64 ", fs = %s", __func__,
373	(unsigned long long)ip->i_dev, fs->fs_bsize, bprev,
374	fs->fs_fsmnt);
375	}
/* um_lock stays dropped across the bread() and chkdq() calls below. */
376	mutex_exit(&ump->um_lock);
377
378	/*
379	* Allocate the extra space in the buffer.
380	*/
/* bp is only initialized (and only used later) when bpp is non-NULL. */
381	if (bpp != NULL &&
382	(error = bread(ITOV(ip), lbprev, osize, `0`, &bp)) != `0`) {
383	return (error);
384	}
385	#if defined(QUOTA) \|\| defined(QUOTA2)
386	if ((error = chkdq(ip, btodb(nsize - osize), cred, `0`)) != `0`) {
387	if (bpp != NULL) {
388	brelse(bp, `0`);
389	}
390	return (error);
391	}
392	#endif
393	/*
394	* Check for extension in the existing location.
395	*/
/* cg is reused further down as the starting group for ffs_hashalloc(). */
396	cg = dtog(fs, bprev);
397	mutex_enter(&ump->um_lock);
398	if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != `0`) {
399	DIP_ADD(ip, blocks, btodb(nsize - osize));
400	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
401
402	if (bpp != NULL) {
/* Extended in place, so the buffer must already map the same block. */
403	if (bp->b_blkno != FFS_FSBTODB(fs, bno)) {
404	panic("%s: bad blockno %#llx != %#llx",
405	__func__, (unsigned long long) bp->b_blkno,
406	(unsigned long long)FFS_FSBTODB(fs, bno));
407	}
/* Grow the buffer and zero the newly added tail before handing it back. */
408	allocbuf(bp, nsize, `1`);
409	memset((char *)bp->b_data + osize, `0`, nsize - osize);
410	mutex_enter(bp->b_objlock);
411	KASSERT(!cv_has_waiters(&bp->b_done));
412	bp->b_oflags \|= BO_DONE;
413	mutex_exit(bp->b_objlock);
414	*bpp = bp;
415	}
416	if (blknop != NULL) {
417	*blknop = bno;
418	}
419	return (`0`);
420	}
421	/*
422	* Allocate a new disk location.
423	*/
424	if (bpref >= fs->fs_size)
425	bpref = `0`;
/* fs_optim selects the request size and may be flipped by either case. */
426	switch ((int)fs->fs_optim) {
427	case FS_OPTSPACE:
428	/*
429	* Allocate an exact sized fragment. Although this makes
430	* best use of space, we will waste time relocating it if
431	* the file continues to grow. If the fragmentation is
432	* less than half of the minimum free reserve, we choose
433	* to begin optimizing for time.
434	*/
435	request = nsize;
436	if (fs->fs_minfree < `5` \|\|
437	fs->fs_cstotal.cs_nffree >
438	fs->fs_dsize * fs->fs_minfree / (`2` * `100`))
439	break;
440
441	if (ffs_log_changeopt) {
442	log(LOG_NOTICE,
443	"%s: optimization changed from SPACE to TIME\n",
444	fs->fs_fsmnt);
445	}
446
447	fs->fs_optim = FS_OPTTIME;
448	break;
449	case FS_OPTTIME:
450	/*
451	* At this point we have discovered a file that is trying to
452	* grow a small fragment to a larger fragment. To save time,
453	* we allocate a full sized block, then free the unused portion.
454	* If the file continues to grow, the `ffs_fragextend' call
455	* above will be able to grow it in place without further
456	* copying. If aberrant programs cause disk fragmentation to
457	* grow within 2% of the free reserve, we choose to begin
458	* optimizing for space.
459	*/
460	request = fs->fs_bsize;
461	if (fs->fs_cstotal.cs_nffree <
462	fs->fs_dsize * (fs->fs_minfree - `2`) / `100`)
463	break;
464
465	if (ffs_log_changeopt) {
466	log(LOG_NOTICE,
467	"%s: optimization changed from TIME to SPACE\n",
468	fs->fs_fsmnt);
469	}
470
471	fs->fs_optim = FS_OPTSPACE;
472	break;
473	default:
474	panic("%s: bad optim: dev = 0x%llx, optim = %d, fs = %s",
475	__func__, (unsigned long long)ip->i_dev, fs->fs_optim,
476	fs->fs_fsmnt);
477	/ NOTREACHED /
478	}
/* Ask for `request' bytes; nsize is passed along as the real size needed. */
479	bno = ffs_hashalloc(ip, cg, bpref, request, nsize, `0`, ffs_alloccg);
480	if (bno > `0`) {
481	/*
482	* Use forced deallocation registration, we can't handle
483	* failure here. This is safe, as this place is ever hit
484	* maximum once per write operation, when fragment is extended
485	* to longer fragment, or a full block.
486	*/
/*
 * When mnt_wapbl is set and the vnode is not a regular file, the old
 * fragment is registered for deallocation; otherwise it is freed
 * directly with ffs_blkfree().
 */
487	if ((ip->i_ump->um_mountp->mnt_wapbl) &&
488	(ITOV(ip)->v_type != VREG)) {
489	/ this should never fail /
490	error = UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(
491	ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev),
492	osize);
493	if (error)
494	panic("ffs_realloccg: dealloc registration failed");
495	} else {
496	ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
497	ip->i_number);
498	}
499	DIP_ADD(ip, blocks, btodb(nsize - osize));
500	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
501	if (bpp != NULL) {
/* Point the buffer at the new location, grow it and zero the new tail. */
502	bp->b_blkno = FFS_FSBTODB(fs, bno);
503	allocbuf(bp, nsize, `1`);
504	memset((char *)bp->b_data + osize, `0`, (u_int)nsize - osize);
505	mutex_enter(bp->b_objlock);
506	KASSERT(!cv_has_waiters(&bp->b_done));
507	bp->b_oflags \|= BO_DONE;
508	mutex_exit(bp->b_objlock);
509	*bpp = bp;
510	}
511	if (blknop != NULL) {
512	*blknop = bno;
513	}
514	return (`0`);
515	}
/* ffs_hashalloc() failed and left um_lock held; release it here. */
516	mutex_exit(&ump->um_lock);
517
518	#if defined(QUOTA) \|\| defined(QUOTA2)
519	/*
520	* Restore user's disk quota because allocation failed.
521	*/
522	(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
523	#endif
524	if (bpp != NULL) {
525	brelse(bp, `0`);
526	}
527
/* Reached with um_lock already released on every path. */
528	nospace:
529	/*
530	* no space available
531	*/
532	ffs_fserr(fs, cred, "file system full");
533	uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
534	return (ENOSPC);
535	}
536
537	/*
538	* Allocate an inode in the file system.
539	*
540	* If allocating a directory, use ffs_dirpref to select the inode.
541	* If allocating in a directory, the following hierarchy is followed:
542	* 1) allocate the preferred inode.
543	* 2) allocate an inode in the same cylinder group.
544	* 3) quadradically rehash into other cylinder groups, until an
545	* available inode is located.
546	* If no inode preference is given the following hierarchy is used
547	* to allocate an inode:
548	* 1) allocate an inode in cylinder group 0.
549	* 2) quadradically rehash into other cylinder groups, until an
550	* available inode is located.
551	*
552	* => um_lock not held upon entry or return
553	*/
554	int
555	ffs_valloc(struct vnode pvp, int* mode, kauth_cred_t cred, ino_t *inop)
556	{
557	struct ufsmount *ump;
558	struct inode *pip;
559	struct fs *fs;
560	ino_t ino, ipref;
561	int cg, error;
562
563	UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);
564
565	pip = VTOI(pvp);
566	fs = pip->i_fs;
567	ump = pip->i_ump;
568
569	error = UFS_WAPBL_BEGIN(pvp->v_mount);
570	if (error) {
571	return error;
572	}
573	mutex_enter(&ump->um_lock);
574	if (fs->fs_cstotal.cs_nifree == `0`)
575	goto noinodes;
576
577	if ((mode & IFMT) == IFDIR)
578	ipref = ffs_dirpref(pip);
579	else
580	ipref = pip->i_number;
581	if (ipref >= fs->fs_ncg * fs->fs_ipg)
582	ipref = `0`;
583	cg = ino_to_cg(fs, ipref);
584	/*
585	* Track number of dirs created one after another
586	* in a same cg without intervening by files.
587	*/
588	if ((mode & IFMT) == IFDIR) {
589	if (fs->fs_contigdirs[cg] < `255`)
590	fs->fs_contigdirs[cg]++;
591	} else {
592	if (fs->fs_contigdirs[cg] > `0`)
593	fs->fs_contigdirs[cg]--;
594	}
595	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, `0`, `0`, ffs_nodealloccg);
596	if (ino == `0`)
597	goto noinodes;
598	UFS_WAPBL_END(pvp->v_mount);
599	*inop = ino;
600	return `0`;
601
602	noinodes:
603	mutex_exit(&ump->um_lock);
604	UFS_WAPBL_END(pvp->v_mount);
605	ffs_fserr(fs, cred, "out of inodes");
606	uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
607	return ENOSPC;
608	}
609
610	/*
611	* Find a cylinder group in which to place a directory.
612	*
613	* The policy implemented by this algorithm is to allocate a
614	* directory inode in the same cylinder group as its parent
615	* directory, but also to reserve space for its files inodes
616	* and data. Restrict the number of directories which may be
617	* allocated one after another in the same cylinder group
618	* without intervening allocation of files.
619	*
620	* If we allocate a first level directory then force allocation
621	* in another cylinder group.
622	*/
623	static ino_t
624	ffs_dirpref(struct inode *pip)
625	{
626	register struct fs *fs;
627	int cg, prefcg;
628	int64_t dirsize, cgsize, curdsz;
629	int avgifree, avgbfree, avgndir;
630	int minifree, minbfree, maxndir;
631	int mincg, minndir;
632	int maxcontigdirs;
633
634	KASSERT(mutex_owned(&pip->i_ump->um_lock));
635
636	fs = pip->i_fs;
637
638	avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
639	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
640	avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
641
642	/*
643	* Force allocation in another cg if creating a first level dir.
644	*/
645	if (ITOV(pip)->v_vflag & VV_ROOT) {
646	prefcg = cprng_fast32() % fs->fs_ncg;
647	mincg = prefcg;
648	minndir = fs->fs_ipg;
649	for (cg = prefcg; cg < fs->fs_ncg; cg++)
650	if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
651	fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
652	fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
653	mincg = cg;
654	minndir = fs->fs_cs(fs, cg).cs_ndir;
655	}
656	for (cg = `0`; cg < prefcg; cg++)
657	if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
658	fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
659	fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
660	mincg = cg;
661	minndir = fs->fs_cs(fs, cg).cs_ndir;
662	}
663	return ((ino_t)(fs->fs_ipg * mincg));
664	}
665
666	/*
667	* Count various limits which used for
668	* optimal allocation of a directory inode.
669	* Try cylinder groups with >75% avgifree and avgbfree.
670	* Avoid cylinder groups with no free blocks or inodes as that
671	* triggers an I/O-expensive cylinder group scan.
672	*/
673	maxndir = min(avgndir + fs->fs_ipg / `16`, fs->fs_ipg);
674	minifree = avgifree - avgifree / `4`;
675	if (minifree < `1`)
676	minifree = `1`;
677	minbfree = avgbfree - avgbfree / `4`;
678	if (minbfree < `1`)
679	minbfree = `1`;
680	cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
681	dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
682	if (avgndir != `0`) {
683	curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
684	if (dirsize < curdsz)
685	dirsize = curdsz;
686	}
687	if (cgsize < dirsize * `255`)
688	maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize;
689	else
690	maxcontigdirs = `255`;
691	if (fs->fs_avgfpdir > `0`)
692	maxcontigdirs = min(maxcontigdirs,
693	fs->fs_ipg / fs->fs_avgfpdir);
694	if (maxcontigdirs == `0`)
695	maxcontigdirs = `1`;
696
697	/*
698	* Limit number of dirs in one cg and reserve space for
699	* regular files, but only if we have no deficit in
700	* inodes or space.
701	*/
702	prefcg = ino_to_cg(fs, pip->i_number);
703	for (cg = prefcg; cg < fs->fs_ncg; cg++)
704	if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
705	fs->fs_cs(fs, cg).cs_nifree >= minifree &&
706	fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
707	if (fs->fs_contigdirs[cg] < maxcontigdirs)
708	return ((ino_t)(fs->fs_ipg * cg));
709	}
710	for (cg = `0`; cg < prefcg; cg++)
711	if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
712	fs->fs_cs(fs, cg).cs_nifree >= minifree &&
713	fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
714	if (fs->fs_contigdirs[cg] < maxcontigdirs)
715	return ((ino_t)(fs->fs_ipg * cg));
716	}
717	/*
718	* This is a backstop when we are deficient in space.
719	*/
720	for (cg = prefcg; cg < fs->fs_ncg; cg++)
721	if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
722	return ((ino_t)(fs->fs_ipg * cg));
723	for (cg = `0`; cg < prefcg; cg++)
724	if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
725	break;
726	return ((ino_t)(fs->fs_ipg * cg));
727	}
728
729	/*
730	* Select the desired position for the next block in a file. The file is
731	* logically divided into sections. The first section is composed of the
732	* direct blocks. Each additional section contains fs_maxbpg blocks.
733	*
734	* If no blocks have been allocated in the first section, the policy is to
735	* request a block in the same cylinder group as the inode that describes
736	* the file. If no blocks have been allocated in any other section, the
737	* policy is to place the section in a cylinder group with a greater than
738	* average number of free blocks. An appropriate cylinder group is found
739	* by using a rotor that sweeps the cylinder groups. When a new group of
740	* blocks is needed, the sweep begins in the cylinder group following the
741	* cylinder group from which the previous allocation was made. The sweep
742	* continues until a cylinder group with greater than the average number
743	* of free blocks is found. If the allocation is for the first block in an
744	* indirect block, the information on the previous allocation is unavailable;
745	* here a best guess is made based upon the logical block number being
746	* allocated.
747	*
748	* If a section is already partially allocated, the policy is to
749	* contiguously allocate fs_maxcontig blocks. The end of one of these
750	* contiguous blocks and the beginning of the next is laid out
751	* contigously if possible.
752	*
753	* => um_lock held on entry and exit
754	*/
755	daddr_t
756	ffs_blkpref_ufs1(struct inode ip, daddr_t lbn, int* indx, int flags,
757	int32_t bap /* XXX ondisk32 /)
758	{
759	struct fs *fs;
760	int cg;
761	int avgbfree, startcg;
762
763	KASSERT(mutex_owned(&ip->i_ump->um_lock));
764
765	fs = ip->i_fs;
766
767	/*
768	* If allocating a contiguous file with B_CONTIG, use the hints
769	* in the inode extentions to return the desired block.
770	*
771	* For metadata (indirect blocks) return the address of where
772	* the first indirect block resides - we'll scan for the next
773	* available slot if we need to allocate more than one indirect
774	* block. For data, return the address of the actual block
775	* relative to the address of the first data block.
776	*/
777	if (flags & B_CONTIG) {
778	KASSERT(ip->i_ffs_first_data_blk != `0`);
779	KASSERT(ip->i_ffs_first_indir_blk != `0`);
780	if (flags & B_METAONLY)
781	return ip->i_ffs_first_indir_blk;
782	else
783	return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
784	}
785
786	if (indx % fs->fs_maxbpg == `0` \|\| bap[indx - `1`] == `0`) {
787	if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
788	cg = ino_to_cg(fs, ip->i_number);
789	return (cgbase(fs, cg) + fs->fs_frag);
790	}
791	/*
792	* Find a cylinder with greater than average number of
793	* unused data blocks.
794	*/
795	if (indx == `0` \|\| bap[indx - `1`] == `0`)
796	startcg =
797	ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
798	else
799	startcg = dtog(fs,
800	ufs_rw32(bap[indx - `1`], UFS_FSNEEDSWAP(fs)) + `1`);
801	startcg %= fs->fs_ncg;
802	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
803	for (cg = startcg; cg < fs->fs_ncg; cg++)
804	if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
805	return (cgbase(fs, cg) + fs->fs_frag);
806	}
807	for (cg = `0`; cg < startcg; cg++)
808	if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
809	return (cgbase(fs, cg) + fs->fs_frag);
810	}
811	return (`0`);
812	}
813	/*
814	* We just always try to lay things out contiguously.
815	*/
816	return ufs_rw32(bap[indx - `1`], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
817	}
818
819	daddr_t
820	ffs_blkpref_ufs2(struct inode ip, daddr_t lbn, int* indx, int flags,
821	int64_t *bap)
822	{
823	struct fs *fs;
824	int cg;
825	int avgbfree, startcg;
826
827	KASSERT(mutex_owned(&ip->i_ump->um_lock));
828
829	fs = ip->i_fs;
830
831	/*
832	* If allocating a contiguous file with B_CONTIG, use the hints
833	* in the inode extentions to return the desired block.
834	*
835	* For metadata (indirect blocks) return the address of where
836	* the first indirect block resides - we'll scan for the next
837	* available slot if we need to allocate more than one indirect
838	* block. For data, return the address of the actual block
839	* relative to the address of the first data block.
840	*/
841	if (flags & B_CONTIG) {
842	KASSERT(ip->i_ffs_first_data_blk != `0`);
843	KASSERT(ip->i_ffs_first_indir_blk != `0`);
844	if (flags & B_METAONLY)
845	return ip->i_ffs_first_indir_blk;
846	else
847	return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
848	}
849
850	if (indx % fs->fs_maxbpg == `0` \|\| bap[indx - `1`] == `0`) {
851	if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
852	cg = ino_to_cg(fs, ip->i_number);
853	return (cgbase(fs, cg) + fs->fs_frag);
854	}
855	/*
856	* Find a cylinder with greater than average number of
857	* unused data blocks.
858	*/
859	if (indx == `0` \|\| bap[indx - `1`] == `0`)
860	startcg =
861	ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
862	else
863	startcg = dtog(fs,
864	ufs_rw64(bap[indx - `1`], UFS_FSNEEDSWAP(fs)) + `1`);
865	startcg %= fs->fs_ncg;
866	avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
867	for (cg = startcg; cg < fs->fs_ncg; cg++)
868	if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
869	return (cgbase(fs, cg) + fs->fs_frag);
870	}
871	for (cg = `0`; cg < startcg; cg++)
872	if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
873	return (cgbase(fs, cg) + fs->fs_frag);
874	}
875	return (`0`);
876	}
877	/*
878	* We just always try to lay things out contiguously.
879	*/
880	return ufs_rw64(bap[indx - `1`], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
881	}
882
883
884	/*
885	* Implement the cylinder overflow algorithm.
886	*
887	* The policy implemented by this algorithm is:
888	* 1) allocate the block in its requested cylinder group.
889	* 2) quadradically rehash on the cylinder group number.
890	* 3) brute force search for a free block.
891	*
892	* => called with um_lock held
893	* => returns with um_lock released on success, held on failure
894	* (*allocator releases lock on success, retains lock on failure)
895	*/
896	/VARARGS5/
897	static daddr_t
898	ffs_hashalloc(struct inode ip, int* cg, daddr_t pref,
899	int size / size for data blocks, mode for inodes /,
900	int realsize,
901	int flags,
902	daddr_t (allocator)(struct* inode , int, daddr_t, int, int, int*))
903	{
904	struct fs *fs;
905	daddr_t result;
906	int i, icg = cg;
907
908	fs = ip->i_fs;
909	/*
910	* 1: preferred cylinder group
911	*/
912	result = (*allocator)(ip, cg, pref, size, realsize, flags);
913	if (result)
914	return (result);
915
916	if (flags & B_CONTIG)
917	return (result);
918	/*
919	* 2: quadratic rehash
920	*/
921	for (i = `1`; i < fs->fs_ncg; i *= `2`) {
922	cg += i;
923	if (cg >= fs->fs_ncg)
924	cg -= fs->fs_ncg;
925	result = (*allocator)(ip, cg, `0`, size, realsize, flags);
926	if (result)
927	return (result);
928	}
929	/*
930	* 3: brute force search
931	* Note that we start at i == 2, since 0 was checked initially,
932	* and 1 is always checked in the quadratic rehash.
933	*/
934	cg = (icg + `2`) % fs->fs_ncg;
935	for (i = `2`; i < fs->fs_ncg; i++) {
936	result = (*allocator)(ip, cg, `0`, size, realsize, flags);
937	if (result)
938	return (result);
939	cg++;
940	if (cg == fs->fs_ncg)
941	cg = `0`;
942	}
943	return (`0`);
944	}
945
946	/*
947	* Determine whether a fragment can be extended.
948	*
949	* Check to see if the necessary fragments are available, and
950	* if they are, allocate them.
951	*
952	* => called with um_lock held
953	* => returns with um_lock released on success, held on failure
954	*/
955	static daddr_t
956	ffs_fragextend(struct inode ip, int* cg, daddr_t bprev, int osize, int nsize)
957	{
958	struct ufsmount *ump;
959	struct fs *fs;
960	struct cg *cgp;
961	struct buf *bp;
962	daddr_t bno;
963	int frags, bbase;
964	int i, error;
965	u_int8_t *blksfree;
966
967	fs = ip->i_fs;
968	ump = ip->i_ump;
969
970	KASSERT(mutex_owned(&ump->um_lock));
971
972	if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize))
973	return (`0`);
974	frags = ffs_numfrags(fs, nsize);
975	bbase = ffs_fragnum(fs, bprev);
976	if (bbase > ffs_fragnum(fs, (bprev + frags - `1`))) {
977	/ cannot extend across a block boundary /
978	return (`0`);
979	}
980	mutex_exit(&ump->um_lock);
981	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
982	(int)fs->fs_cgsize, B_MODIFY, &bp);
983	if (error)
984	goto fail;
985	cgp = (struct cg *)bp->b_data;
986	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
987	goto fail;
988	cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
989	if ((fs->fs_magic != FS_UFS1_MAGIC) \|\|
990	(fs->fs_old_flags & FS_FLAGS_UPDATED))
991	cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
992	bno = dtogd(fs, bprev);
993	blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
994	for (i = ffs_numfrags(fs, osize); i < frags; i++)
995	if (isclr(blksfree, bno + i))
996	goto fail;
997	/*
998	* the current fragment can be extended
999	* deduct the count on fragment being extended into
1000	* increase the count on the remaining fragment (if any)
1001	* allocate the extended piece
1002	*/
1003	for (i = frags; i < fs->fs_frag - bbase; i++)
1004	if (isclr(blksfree, bno + i))
1005	break;
1006	ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -`1`, UFS_FSNEEDSWAP(fs));
1007	if (i != frags)
1008	ufs_add32(cgp->cg_frsum[i - frags], `1`, UFS_FSNEEDSWAP(fs));
1009	mutex_enter(&ump->um_lock);
1010	for (i = ffs_numfrags(fs, osize); i < frags; i++) {
1011	clrbit(blksfree, bno + i);
1012	ufs_add32(cgp->cg_cs.cs_nffree, -`1`, UFS_FSNEEDSWAP(fs));
1013	fs->fs_cstotal.cs_nffree--;
1014	fs->fs_cs(fs, cg).cs_nffree--;
1015	}
1016	fs->fs_fmod = `1`;
1017	ACTIVECG_CLR(fs, cg);
1018	mutex_exit(&ump->um_lock);
1019	bdwrite(bp);
1020	return (bprev);
1021
1022	fail:
1023	if (bp != NULL)
1024	brelse(bp, `0`);
1025	mutex_enter(&ump->um_lock);
1026	return (`0`);
1027	}
1028
1029	/*
1030	* Determine whether a block can be allocated.
1031	*
1032	* Check to see if a block of the appropriate size is available,
1033	* and if it is, allocate it.
1034	*/
1035	static daddr_t
1036	ffs_alloccg(struct inode ip, int* cg, daddr_t bpref, int size, int realsize,
1037	int flags)
1038	{
1039	struct ufsmount *ump;
1040	struct fs *fs = ip->i_fs;
1041	struct cg *cgp;
1042	struct buf *bp;
1043	int32_t bno;
1044	daddr_t blkno;
1045	int error, frags, allocsiz, i;
1046	u_int8_t *blksfree;
1047	const int needswap = UFS_FSNEEDSWAP(fs);
1048
1049	ump = ip->i_ump;
1050
1051	KASSERT(mutex_owned(&ump->um_lock));
1052
1053	if (fs->fs_cs(fs, cg).cs_nbfree == `0` && size == fs->fs_bsize)
1054	return (`0`);
1055	mutex_exit(&ump->um_lock);
1056	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
1057	(int)fs->fs_cgsize, B_MODIFY, &bp);
1058	if (error)
1059	goto fail;
1060	cgp = (struct cg *)bp->b_data;
1061	if (!cg_chkmagic(cgp, needswap) \|\|
1062	(cgp->cg_cs.cs_nbfree == `0` && size == fs->fs_bsize))
1063	goto fail;
1064	cgp->cg_old_time = ufs_rw32(time_second, needswap);
1065	if ((fs->fs_magic != FS_UFS1_MAGIC) \|\|
1066	(fs->fs_old_flags & FS_FLAGS_UPDATED))
1067	cgp->cg_time = ufs_rw64(time_second, needswap);
1068	if (size == fs->fs_bsize) {
1069	mutex_enter(&ump->um_lock);
1070	blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
1071	ACTIVECG_CLR(fs, cg);
1072	mutex_exit(&ump->um_lock);
1073
1074	/*
1075	* If actually needed size is lower, free the extra blocks now.
1076	* This is safe to call here, there is no outside reference
1077	* to this block yet. It is not necessary to keep um_lock
1078	* locked.
1079	*/
1080	if (realsize != `0` && realsize < size) {
1081	ffs_blkfree_common(ip->i_ump, ip->i_fs,
1082	ip->i_devvp->v_rdev,
1083	bp, blkno + ffs_numfrags(fs, realsize),
1084	(long)(size - realsize), false);
1085	}
1086
1087	bdwrite(bp);
1088	return (blkno);
1089	}
1090	/*
1091	* check to see if any fragments are already available
1092	* allocsiz is the size which will be allocated, hacking
1093	* it down to a smaller size if necessary
1094	*/
1095	blksfree = cg_blksfree(cgp, needswap);
1096	frags = ffs_numfrags(fs, size);
1097	for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
1098	if (cgp->cg_frsum[allocsiz] != `0`)
1099	break;
1100	if (allocsiz == fs->fs_frag) {
1101	/*
1102	* no fragments were available, so a block will be
1103	* allocated, and hacked up
1104	*/
1105	if (cgp->cg_cs.cs_nbfree == `0`)
1106	goto fail;
1107	mutex_enter(&ump->um_lock);
1108	blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
1109	bno = dtogd(fs, blkno);
1110	for (i = frags; i < fs->fs_frag; i++)
1111	setbit(blksfree, bno + i);
1112	i = fs->fs_frag - frags;
1113	ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
1114	fs->fs_cstotal.cs_nffree += i;
1115	fs->fs_cs(fs, cg).cs_nffree += i;
1116	fs->fs_fmod = `1`;
1117	ufs_add32(cgp->cg_frsum[i], `1`, needswap);
1118	ACTIVECG_CLR(fs, cg);
1119	mutex_exit(&ump->um_lock);
1120	bdwrite(bp);
1121	return (blkno);
1122	}
1123	bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
1124	#if 0
1125	/*
1126	* XXX fvdl mapsearch will panic, and never return -1
1127	* also: returning NULL as daddr_t ?
1128	*/
1129	if (bno < `0`)
1130	goto fail;
1131	#endif
1132	for (i = `0`; i < frags; i++)
1133	clrbit(blksfree, bno + i);
1134	mutex_enter(&ump->um_lock);
1135	ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
1136	fs->fs_cstotal.cs_nffree -= frags;
1137	fs->fs_cs(fs, cg).cs_nffree -= frags;
1138	fs->fs_fmod = `1`;
1139	ufs_add32(cgp->cg_frsum[allocsiz], -`1`, needswap);
1140	if (frags != allocsiz)
1141	ufs_add32(cgp->cg_frsum[allocsiz - frags], `1`, needswap);
1142	blkno = cgbase(fs, cg) + bno;
1143	ACTIVECG_CLR(fs, cg);
1144	mutex_exit(&ump->um_lock);
1145	bdwrite(bp);
1146	return blkno;
1147
1148	fail:
1149	if (bp != NULL)
1150	brelse(bp, `0`);
1151	mutex_enter(&ump->um_lock);
1152	return (`0`);
1153	}
1154
1155	/*
1156	* Allocate a block in a cylinder group.
1157	*
1158	* This algorithm implements the following policy:
1159	* 1) allocate the requested block.
1160	* 2) allocate a rotationally optimal block in the same cylinder.
1161	* 3) allocate the next available block on the block rotor for the
1162	* specified cylinder group.
1163	* Note that this routine only allocates fs_bsize blocks; these
1164	* blocks may be fragmented by the routine that allocates them.
1165	*/
1166	static daddr_t
1167	ffs_alloccgblk(struct inode ip, struct* buf bp, daddr_t bpref, int* realsize,
1168	int flags)
1169	{
1170	struct fs *fs = ip->i_fs;
1171	struct cg *cgp;
1172	int cg;
1173	daddr_t blkno;
1174	int32_t bno;
1175	u_int8_t *blksfree;
1176	const int needswap = UFS_FSNEEDSWAP(fs);
1177
1178	KASSERT(mutex_owned(&ip->i_ump->um_lock));
1179
1180	cgp = (struct cg *)bp->b_data;
1181	blksfree = cg_blksfree(cgp, needswap);
1182	if (bpref == `0` \|\| dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
1183	bpref = ufs_rw32(cgp->cg_rotor, needswap);
1184	} else {
1185	bpref = ffs_blknum(fs, bpref);
1186	bno = dtogd(fs, bpref);
1187	/*
1188	* if the requested block is available, use it
1189	*/
1190	if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno)))
1191	goto gotit;
1192	/*
1193	* if the requested data block isn't available and we are
1194	* trying to allocate a contiguous file, return an error.
1195	*/
1196	if ((flags & (B_CONTIG \| B_METAONLY)) == B_CONTIG)
1197	return (`0`);
1198	}
1199
1200	/*
1201	* Take the next available block in this cylinder group.
1202	*/
1203	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
1204	#if 0
1205	/*
1206	* XXX jdolecek ffs_mapsearch() succeeds or panics
1207	*/
1208	if (bno < `0`)
1209	return (`0`);
1210	#endif
1211	cgp->cg_rotor = ufs_rw32(bno, needswap);
1212	gotit:
1213	blkno = ffs_fragstoblks(fs, bno);
1214	ffs_clrblock(fs, blksfree, blkno);
1215	ffs_clusteracct(fs, cgp, blkno, -`1`);
1216	ufs_add32(cgp->cg_cs.cs_nbfree, -`1`, needswap);
1217	fs->fs_cstotal.cs_nbfree--;
1218	fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
1219	if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1220	((fs->fs_old_flags & FS_FLAGS_UPDATED) == `0`)) {
1221	int cylno;
1222	cylno = old_cbtocylno(fs, bno);
1223	KASSERT(cylno >= `0`);
1224	KASSERT(cylno < fs->fs_old_ncyl);
1225	KASSERT(old_cbtorpos(fs, bno) >= `0`);
1226	KASSERT(fs->fs_old_nrpos == `0` \|\| old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
1227	ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -`1`,
1228	needswap);
1229	ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -`1`, needswap);
1230	}
1231	fs->fs_fmod = `1`;
1232	cg = ufs_rw32(cgp->cg_cgx, needswap);
1233	blkno = cgbase(fs, cg) + bno;
1234	return (blkno);
1235	}
1236
1237	/*
1238	* Determine whether an inode can be allocated.
1239	*
1240	* Check to see if an inode is available, and if it is,
1241	* allocate it using the following policy:
1242	* 1) allocate the requested inode.
1243	* 2) allocate the next available inode after the requested
1244	* inode in the specified cylinder group.
1245	*/
1246	static daddr_t
1247	ffs_nodealloccg(struct inode ip, int* cg, daddr_t ipref, int mode, int realsize,
1248	int flags)
1249	{
1250	struct ufsmount *ump = ip->i_ump;
1251	struct fs *fs = ip->i_fs;
1252	struct cg *cgp;
1253	struct buf bp, ibp;
1254	u_int8_t *inosused;
1255	int error, start, len, loc, map, i;
1256	int32_t initediblk;
1257	daddr_t nalloc;
1258	struct ufs2_dinode *dp2;
1259	const int needswap = UFS_FSNEEDSWAP(fs);
1260
1261	KASSERT(mutex_owned(&ump->um_lock));
1262	UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);
1263
1264	if (fs->fs_cs(fs, cg).cs_nifree == `0`)
1265	return (`0`);
1266	mutex_exit(&ump->um_lock);
1267	ibp = NULL;
1268	initediblk = -`1`;
1269	retry:
1270	error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
1271	(int)fs->fs_cgsize, B_MODIFY, &bp);
1272	if (error)
1273	goto fail;
1274	cgp = (struct cg *)bp->b_data;
1275	if (!cg_chkmagic(cgp, needswap) \|\| cgp->cg_cs.cs_nifree == `0`)
1276	goto fail;
1277
1278	if (ibp != NULL &&
1279	initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
1280	/ Another thread allocated more inodes so we retry the test. /
1281	brelse(ibp, `0`);
1282	ibp = NULL;
1283	}
1284	/*
1285	* Check to see if we need to initialize more inodes.
1286	*/
1287	if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
1288	initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
1289	nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
1290	if (nalloc + FFS_INOPB(fs) > initediblk &&
1291	initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
1292	/*
1293	* We have to release the cg buffer here to prevent
1294	* a deadlock when reading the inode block will
1295	* run a copy-on-write that might use this cg.
1296	*/
1297	brelse(bp, `0`);
1298	bp = NULL;
1299	error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs,
1300	ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
1301	FFS_NOBLK, fs->fs_bsize, false, &ibp);
1302	if (error)
1303	goto fail;
1304	goto retry;
1305	}
1306	}
1307
1308	cgp->cg_old_time = ufs_rw32(time_second, needswap);
1309	if ((fs->fs_magic != FS_UFS1_MAGIC) \|\|
1310	(fs->fs_old_flags & FS_FLAGS_UPDATED))
1311	cgp->cg_time = ufs_rw64(time_second, needswap);
1312	inosused = cg_inosused(cgp, needswap);
1313	if (ipref) {
1314	ipref %= fs->fs_ipg;
1315	if (isclr(inosused, ipref))
1316	goto gotit;
1317	}
1318	start = ufs_rw32(cgp->cg_irotor, needswap) / NBBY;
1319	len = howmany(fs->fs_ipg - ufs_rw32(cgp->cg_irotor, needswap),
1320	NBBY);
1321	loc = skpc(`0xff`, len, &inosused[start]);
1322	if (loc == `0`) {
1323	len = start + `1`;
1324	start = `0`;
1325	loc = skpc(`0xff`, len, &inosused[`0`]);
1326	if (loc == `0`) {
1327	panic("%s: map corrupted: cg=%d, irotor=%d, fs=%s",
1328	__func__, cg, ufs_rw32(cgp->cg_irotor, needswap),
1329	fs->fs_fsmnt);
1330	/ NOTREACHED /
1331	}
1332	}
1333	i = start + len - loc;
1334	map = inosused[i] ^ `0xff`;
1335	if (map == `0`) {
1336	panic("%s: block not in map: fs=%s", __func__, fs->fs_fsmnt);
1337	}
1338	ipref = i * NBBY + ffs(map) - `1`;
1339	cgp->cg_irotor = ufs_rw32(ipref, needswap);
1340	gotit:
1341	UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
1342	mode);
1343	/*
1344	* Check to see if we need to initialize more inodes.
1345	*/
1346	if (ibp != NULL) {
1347	KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
1348	memset(ibp->b_data, `0`, fs->fs_bsize);
1349	dp2 = (struct ufs2_dinode *)(ibp->b_data);
1350	for (i = `0`; i < FFS_INOPB(fs); i++) {
1351	/*
1352	* Don't bother to swap, it's supposed to be
1353	* random, after all.
1354	*/
1355	dp2->di_gen = (cprng_fast32() & INT32_MAX) / `2` + `1`;
1356	dp2++;
1357	}
1358	initediblk += FFS_INOPB(fs);
1359	cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
1360	}
1361
1362	mutex_enter(&ump->um_lock);
1363	ACTIVECG_CLR(fs, cg);
1364	setbit(inosused, ipref);
1365	ufs_add32(cgp->cg_cs.cs_nifree, -`1`, needswap);
1366	fs->fs_cstotal.cs_nifree--;
1367	fs->fs_cs(fs, cg).cs_nifree--;
1368	fs->fs_fmod = `1`;
1369	if ((mode & IFMT) == IFDIR) {
1370	ufs_add32(cgp->cg_cs.cs_ndir, `1`, needswap);
1371	fs->fs_cstotal.cs_ndir++;
1372	fs->fs_cs(fs, cg).cs_ndir++;
1373	}
1374	mutex_exit(&ump->um_lock);
1375	if (ibp != NULL) {
1376	bwrite(bp);
1377	bawrite(ibp);
1378	} else
1379	bdwrite(bp);
1380	return (cg * fs->fs_ipg + ipref);
1381	fail:
1382	if (bp != NULL)
1383	brelse(bp, `0`);
1384	if (ibp != NULL)
1385	brelse(ibp, `0`);
1386	mutex_enter(&ump->um_lock);
1387	return (`0`);
1388	}
1389
1390	/*
1391	* Allocate a block or fragment.
1392	*
1393	* The specified block or fragment is removed from the
1394	* free map, possibly fragmenting a block in the process.
1395	*
1396	* This implementation should mirror fs_blkfree
1397	*
1398	* => um_lock not held on entry or exit
1399	*/
1400	int
1401	ffs_blkalloc(struct inode ip, daddr_t bno, long* size)
1402	{
1403	int error;
1404
1405	error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size,
1406	ip->i_dev, ip->i_uid);
1407	if (error)
1408	return error;
1409
1410	return ffs_blkalloc_ump(ip->i_ump, bno, size);
1411	}
1412
1413	int
1414	ffs_blkalloc_ump(struct ufsmount ump, daddr_t bno, long* size)
1415	{
1416	struct fs *fs = ump->um_fs;
1417	struct cg *cgp;
1418	struct buf *bp;
1419	int32_t fragno, cgbno;
1420	int i, error, cg, blk, frags, bbase;
1421	u_int8_t *blksfree;
1422	const int needswap = UFS_FSNEEDSWAP(fs);
1423
1424	KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == `0` &&
1425	ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag);
1426	KASSERT(bno < fs->fs_size);
1427
1428	cg = dtog(fs, bno);
1429	error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
1430	(int)fs->fs_cgsize, B_MODIFY, &bp);
1431	if (error) {
1432	return error;
1433	}
1434	cgp = (struct cg *)bp->b_data;
1435	if (!cg_chkmagic(cgp, needswap)) {
1436	brelse(bp, `0`);
1437	return EIO;
1438	}
1439	cgp->cg_old_time = ufs_rw32(time_second, needswap);
1440	cgp->cg_time = ufs_rw64(time_second, needswap);
1441	cgbno = dtogd(fs, bno);
1442	blksfree = cg_blksfree(cgp, needswap);
1443
1444	mutex_enter(&ump->um_lock);
1445	if (size == fs->fs_bsize) {
1446	fragno = ffs_fragstoblks(fs, cgbno);
1447	if (!ffs_isblock(fs, blksfree, fragno)) {
1448	mutex_exit(&ump->um_lock);
1449	brelse(bp, `0`);
1450	return EBUSY;
1451	}
1452	ffs_clrblock(fs, blksfree, fragno);
1453	ffs_clusteracct(fs, cgp, fragno, -`1`);
1454	ufs_add32(cgp->cg_cs.cs_nbfree, -`1`, needswap);
1455	fs->fs_cstotal.cs_nbfree--;
1456	fs->fs_cs(fs, cg).cs_nbfree--;
1457	} else {
1458	bbase = cgbno - ffs_fragnum(fs, cgbno);
1459
1460	frags = ffs_numfrags(fs, size);
1461	for (i = `0`; i < frags; i++) {
1462	if (isclr(blksfree, cgbno + i)) {
1463	mutex_exit(&ump->um_lock);
1464	brelse(bp, `0`);
1465	return EBUSY;
1466	}
1467	}
1468	/*
1469	* if a complete block is being split, account for it
1470	*/
1471	fragno = ffs_fragstoblks(fs, bbase);
1472	if (ffs_isblock(fs, blksfree, fragno)) {
1473	ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
1474	fs->fs_cstotal.cs_nffree += fs->fs_frag;
1475	fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
1476	ffs_clusteracct(fs, cgp, fragno, -`1`);
1477	ufs_add32(cgp->cg_cs.cs_nbfree, -`1`, needswap);
1478	fs->fs_cstotal.cs_nbfree--;
1479	fs->fs_cs(fs, cg).cs_nbfree--;
1480	}
1481	/*
1482	* decrement the counts associated with the old frags
1483	*/
1484	blk = blkmap(fs, blksfree, bbase);
1485	ffs_fragacct(fs, blk, cgp->cg_frsum, -`1`, needswap);
1486	/*
1487	* allocate the fragment
1488	*/
1489	for (i = `0`; i < frags; i++) {
1490	clrbit(blksfree, cgbno + i);
1491	}
1492	ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
1493	fs->fs_cstotal.cs_nffree -= i;
1494	fs->fs_cs(fs, cg).cs_nffree -= i;
1495	/*
1496	* add back in counts associated with the new frags
1497	*/
1498	blk = blkmap(fs, blksfree, bbase);
1499	ffs_fragacct(fs, blk, cgp->cg_frsum, `1`, needswap);
1500	}
1501	fs->fs_fmod = `1`;
1502	ACTIVECG_CLR(fs, cg);
1503	mutex_exit(&ump->um_lock);
1504	bdwrite(bp);
1505	return `0`;
1506	}
1507
1508	/*
1509	* Free a block or fragment.
1510	*
1511	* The specified block or fragment is placed back in the
1512	* free map. If a fragment is deallocated, a possible
1513	* block reassembly is checked.
1514	*
1515	* => um_lock not held on entry or exit
1516	*/
1517	static void
1518	ffs_blkfree_cg(struct fs fs, struct* vnode devvp, daddr_t bno, long* size)
1519	{
1520	struct cg *cgp;
1521	struct buf *bp;
1522	struct ufsmount *ump;
1523	daddr_t cgblkno;
1524	int error, cg;
1525	dev_t dev;
1526	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
1527	const int needswap = UFS_FSNEEDSWAP(fs);
1528
1529	KASSERT(!devvp_is_snapshot);
1530
1531	cg = dtog(fs, bno);
1532	dev = devvp->v_rdev;
1533	ump = VFSTOUFS(spec_node_getmountedfs(devvp));
1534	KASSERT(fs == ump->um_fs);
1535	cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg));
1536
1537	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
1538	B_MODIFY, &bp);
1539	if (error) {
1540	return;
1541	}
1542	cgp = (struct cg *)bp->b_data;
1543	if (!cg_chkmagic(cgp, needswap)) {
1544	brelse(bp, `0`);
1545	return;
1546	}
1547
1548	ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
1549
1550	bdwrite(bp);
1551	}
1552
1553	struct discardopdata {
1554	struct work wk; / must be first /
1555	struct vnode *devvp;
1556	daddr_t bno;
1557	long size;
1558	};
1559
1560	struct discarddata {
1561	struct fs *fs;
1562	struct discardopdata *entry;
1563	long maxsize;
1564	kmutex_t entrylk;
1565	struct workqueue *wq;
1566	int wqcnt, wqdraining;
1567	kmutex_t wqlk;
1568	kcondvar_t wqcv;
1569	/ timer for flush? /
1570	};
1571
1572	static void
1573	ffs_blkfree_td(struct fs fs, struct* discardopdata *td)
1574	{
1575	struct mount *mp = spec_node_getmountedfs(td->devvp);
1576	long todo;
1577	int error;
1578
1579	while (td->size) {
1580	todo = min(td->size,
1581	ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno))));
1582	error = UFS_WAPBL_BEGIN(mp);
1583	if (error) {
1584	printf("ffs: failed to begin wapbl transaction"
1585	" for discard: %d\n", error);
1586	break;
1587	}
1588	ffs_blkfree_cg(fs, td->devvp, td->bno, todo);
1589	UFS_WAPBL_END(mp);
1590	td->bno += ffs_numfrags(fs, todo);
1591	td->size -= todo;
1592	}
1593	}
1594
1595	static void
1596	ffs_discardcb(struct work wk, void* *arg)
1597	{
1598	struct discardopdata td = (void* *)wk;
1599	struct discarddata *ts = arg;
1600	struct fs *fs = ts->fs;
1601	off_t start, len;
1602	#ifdef TRIMDEBUG
1603	int error;
1604	#endif
1605
1606	/ like FSBTODB but emits bytes; XXX move to fs.h /
1607	#ifndef FFS_FSBTOBYTES
1608	#define FFS_FSBTOBYTES(fs, b) ((b) << (fs)->fs_fshift)
1609	#endif
1610
1611	start = FFS_FSBTOBYTES(fs, td->bno);
1612	len = td->size;
1613	#ifdef TRIMDEBUG
1614	error =
1615	#endif
1616	VOP_FDISCARD(td->devvp, start, len);
1617	#ifdef TRIMDEBUG
1618	printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error);
1619	#endif
1620
1621	ffs_blkfree_td(fs, td);
1622	kmem_free(td, sizeof(*td));
1623	mutex_enter(&ts->wqlk);
1624	ts->wqcnt--;
1625	if (ts->wqdraining && !ts->wqcnt)
1626	cv_signal(&ts->wqcv);
1627	mutex_exit(&ts->wqlk);
1628	}
1629
1630	void *
1631	ffs_discard_init(struct vnode devvp, struct* fs *fs)
1632	{
1633	struct discarddata *ts;
1634	int error;
1635
1636	ts = kmem_zalloc(sizeof (*ts), KM_SLEEP);
1637	error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts,
1638	`0`, `0`, `0`);
1639	if (error) {
1640	kmem_free(ts, sizeof (*ts));
1641	return NULL;
1642	}
1643	mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE);
1644	mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE);
1645	cv_init(&ts->wqcv, "trimwqcv");
1646	ts->maxsize = `100``1024`; /* XXX /
1647	ts->fs = fs;
1648	return ts;
1649	}
1650
1651	void
1652	ffs_discard_finish(void vts, int* flags)
1653	{
1654	struct discarddata *ts = vts;
1655	struct discardopdata *td = NULL;
1656	int res = `0`;
1657
1658	/ wait for workqueue to drain /
1659	mutex_enter(&ts->wqlk);
1660	if (ts->wqcnt) {
1661	ts->wqdraining = `1`;
1662	res = cv_timedwait(&ts->wqcv, &ts->wqlk, mstohz(`5000`));
1663	}
1664	mutex_exit(&ts->wqlk);
1665	if (res)
1666	printf("ffs_discarddata drain timeout\n");
1667
1668	mutex_enter(&ts->entrylk);
1669	if (ts->entry) {
1670	td = ts->entry;
1671	ts->entry = NULL;
1672	}
1673	mutex_exit(&ts->entrylk);
1674	if (td) {
1675	/ XXX don't tell disk, its optional /
1676	ffs_blkfree_td(ts->fs, td);
1677	#ifdef TRIMDEBUG
1678	printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size);
1679	#endif
1680	kmem_free(td, sizeof(*td));
1681	}
1682
1683	cv_destroy(&ts->wqcv);
1684	mutex_destroy(&ts->entrylk);
1685	mutex_destroy(&ts->wqlk);
1686	workqueue_destroy(ts->wq);
1687	kmem_free(ts, sizeof(*ts));
1688	}
1689
1690	void
1691	ffs_blkfree(struct fs fs, struct* vnode devvp, daddr_t bno, long* size,
1692	ino_t inum)
1693	{
1694	struct ufsmount *ump;
1695	int error;
1696	dev_t dev;
1697	struct discarddata *ts;
1698	struct discardopdata *td;
1699
1700	dev = devvp->v_rdev;
1701	ump = VFSTOUFS(spec_node_getmountedfs(devvp));
1702	if (ffs_snapblkfree(fs, devvp, bno, size, inum))
1703	return;
1704
1705	error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
1706	if (error)
1707	return;
1708
1709	if (!ump->um_discarddata) {
1710	ffs_blkfree_cg(fs, devvp, bno, size);
1711	return;
1712	}
1713
1714	#ifdef TRIMDEBUG
1715	printf("blkfree(%" PRId64 ",%ld)\n", bno, size);
1716	#endif
1717	ts = ump->um_discarddata;
1718	td = NULL;
1719
1720	mutex_enter(&ts->entrylk);
1721	if (ts->entry) {
1722	td = ts->entry;
1723	/ ffs deallocs backwards, check for prepend only /
1724	if (td->bno == bno + ffs_numfrags(fs, size)
1725	&& td->size + size <= ts->maxsize) {
1726	td->bno = bno;
1727	td->size += size;
1728	if (td->size < ts->maxsize) {
1729	#ifdef TRIMDEBUG
1730	printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
1731	#endif
1732	mutex_exit(&ts->entrylk);
1733	return;
1734	}
1735	size = `0`; / mark done /
1736	}
1737	ts->entry = NULL;
1738	}
1739	mutex_exit(&ts->entrylk);
1740
1741	if (td) {
1742	#ifdef TRIMDEBUG
1743	printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size);
1744	#endif
1745	mutex_enter(&ts->wqlk);
1746	ts->wqcnt++;
1747	mutex_exit(&ts->wqlk);
1748	workqueue_enqueue(ts->wq, &td->wk, NULL);
1749	}
1750	if (!size)
1751	return;
1752
1753	td = kmem_alloc(sizeof(*td), KM_SLEEP);
1754	td->devvp = devvp;
1755	td->bno = bno;
1756	td->size = size;
1757
1758	if (td->size < ts->maxsize) { / XXX always the case /
1759	mutex_enter(&ts->entrylk);
1760	if (!ts->entry) { / possible race? /
1761	#ifdef TRIMDEBUG
1762	printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
1763	#endif
1764	ts->entry = td;
1765	td = NULL;
1766	}
1767	mutex_exit(&ts->entrylk);
1768	}
1769	if (td) {
1770	#ifdef TRIMDEBUG
1771	printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size);
1772	#endif
1773	mutex_enter(&ts->wqlk);
1774	ts->wqcnt++;
1775	mutex_exit(&ts->wqlk);
1776	workqueue_enqueue(ts->wq, &td->wk, NULL);
1777	}
1778	}
1779
1780	/*
1781	* Free a block or fragment from a snapshot cg copy.
1782	*
1783	* The specified block or fragment is placed back in the
1784	* free map. If a fragment is deallocated, a possible
1785	* block reassembly is checked.
1786	*
1787	* => um_lock not held on entry or exit
1788	*/
1789	void
1790	ffs_blkfree_snap(struct fs fs, struct* vnode devvp, daddr_t bno, long* size,
1791	ino_t inum)
1792	{
1793	struct cg *cgp;
1794	struct buf *bp;
1795	struct ufsmount *ump;
1796	daddr_t cgblkno;
1797	int error, cg;
1798	dev_t dev;
1799	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
1800	const int needswap = UFS_FSNEEDSWAP(fs);
1801
1802	KASSERT(devvp_is_snapshot);
1803
1804	cg = dtog(fs, bno);
1805	dev = VTOI(devvp)->i_devvp->v_rdev;
1806	ump = VFSTOUFS(devvp->v_mount);
1807	cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg));
1808
1809	error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
1810	if (error)
1811	return;
1812
1813	error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
1814	B_MODIFY, &bp);
1815	if (error) {
1816	return;
1817	}
1818	cgp = (struct cg *)bp->b_data;
1819	if (!cg_chkmagic(cgp, needswap)) {
1820	brelse(bp, `0`);
1821	return;
1822	}
1823
1824	ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
1825
1826	bdwrite(bp);
1827	}
1828
1829	static void
1830	ffs_blkfree_common(struct ufsmount ump, struct* fs *fs, dev_t dev,
1831	struct buf bp, daddr_t bno, long* size, bool devvp_is_snapshot)
1832	{
1833	struct cg *cgp;
1834	int32_t fragno, cgbno;
1835	int i, cg, blk, frags, bbase;
1836	u_int8_t *blksfree;
1837	const int needswap = UFS_FSNEEDSWAP(fs);
1838
1839	cg = dtog(fs, bno);
1840	cgp = (struct cg *)bp->b_data;
1841	cgp->cg_old_time = ufs_rw32(time_second, needswap);
1842	if ((fs->fs_magic != FS_UFS1_MAGIC) \|\|
1843	(fs->fs_old_flags & FS_FLAGS_UPDATED))
1844	cgp->cg_time = ufs_rw64(time_second, needswap);
1845	cgbno = dtogd(fs, bno);
1846	blksfree = cg_blksfree(cgp, needswap);
1847	mutex_enter(&ump->um_lock);
1848	if (size == fs->fs_bsize) {
1849	fragno = ffs_fragstoblks(fs, cgbno);
1850	if (!ffs_isfreeblock(fs, blksfree, fragno)) {
1851	if (devvp_is_snapshot) {
1852	mutex_exit(&ump->um_lock);
1853	return;
1854	}
1855	panic("%s: freeing free block: dev = 0x%llx, block = %"
1856	PRId64 ", fs = %s", __func__,
1857	(unsigned long long)dev, bno, fs->fs_fsmnt);
1858	}
1859	ffs_setblock(fs, blksfree, fragno);
1860	ffs_clusteracct(fs, cgp, fragno, `1`);
1861	ufs_add32(cgp->cg_cs.cs_nbfree, `1`, needswap);
1862	fs->fs_cstotal.cs_nbfree++;
1863	fs->fs_cs(fs, cg).cs_nbfree++;
1864	if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1865	((fs->fs_old_flags & FS_FLAGS_UPDATED) == `0`)) {
1866	i = old_cbtocylno(fs, cgbno);
1867	KASSERT(i >= `0`);
1868	KASSERT(i < fs->fs_old_ncyl);
1869	KASSERT(old_cbtorpos(fs, cgbno) >= `0`);
1870	KASSERT(fs->fs_old_nrpos == `0` \|\| old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
1871	ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], `1`,
1872	needswap);
1873	ufs_add32(old_cg_blktot(cgp, needswap)[i], `1`, needswap);
1874	}
1875	} else {
1876	bbase = cgbno - ffs_fragnum(fs, cgbno);
1877	/*
1878	* decrement the counts associated with the old frags
1879	*/
1880	blk = blkmap(fs, blksfree, bbase);
1881	ffs_fragacct(fs, blk, cgp->cg_frsum, -`1`, needswap);
1882	/*
1883	* deallocate the fragment
1884	*/
1885	frags = ffs_numfrags(fs, size);
1886	for (i = `0`; i < frags; i++) {
1887	if (isset(blksfree, cgbno + i)) {
1888	panic("%s: freeing free frag: "
1889	"dev = 0x%llx, block = %" PRId64
1890	", fs = %s", __func__,
1891	(unsigned long long)dev, bno + i,
1892	fs->fs_fsmnt);
1893	}
1894	setbit(blksfree, cgbno + i);
1895	}
1896	ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
1897	fs->fs_cstotal.cs_nffree += i;
1898	fs->fs_cs(fs, cg).cs_nffree += i;
1899	/*
1900	* add back in counts associated with the new frags
1901	*/
1902	blk = blkmap(fs, blksfree, bbase);
1903	ffs_fragacct(fs, blk, cgp->cg_frsum, `1`, needswap);
1904	/*
1905	* if a complete block has been reassembled, account for it
1906	*/
1907	fragno = ffs_fragstoblks(fs, bbase);
1908	if (ffs_isblock(fs, blksfree, fragno)) {
1909	ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
1910	fs->fs_cstotal.cs_nffree -= fs->fs_frag;
1911	fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
1912	ffs_clusteracct(fs, cgp, fragno, `1`);
1913	ufs_add32(cgp->cg_cs.cs_nbfree, `1`, needswap);
1914	fs->fs_cstotal.cs_nbfree++;
1915	fs->fs_cs(fs, cg).cs_nbfree++;
1916	if ((fs->fs_magic == FS_UFS1_MAGIC) &&
1917	((fs->fs_old_flags & FS_FLAGS_UPDATED) == `0`)) {
1918	i = old_cbtocylno(fs, bbase);
1919	KASSERT(i >= `0`);
1920	KASSERT(i < fs->fs_old_ncyl);
1921	KASSERT(old_cbtorpos(fs, bbase) >= `0`);
1922	KASSERT(fs->fs_old_nrpos == `0` \|\| old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
1923	ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
1924	bbase)], `1`, needswap);
1925	ufs_add32(old_cg_blktot(cgp, needswap)[i], `1`, needswap);
1926	}
1927	}
1928	}
1929	fs->fs_fmod = `1`;
1930	ACTIVECG_CLR(fs, cg);
1931	mutex_exit(&ump->um_lock);
1932	}
1933
1934	/*
1935	* Free an inode.
1936	*/
1937	int
1938	ffs_vfree(struct vnode vp, ino_t ino, int* mode)
1939	{
1940
1941	return ffs_freefile(vp->v_mount, ino, mode);
1942	}
1943
1944	/*
1945	* Do the actual free operation.
1946	* The specified inode is placed back in the free map.
1947	*
1948	* => um_lock not held on entry or exit
1949	*/
1950	int
1951	ffs_freefile(struct mount mp, ino_t ino, int* mode)
1952	{
1953	struct ufsmount *ump = VFSTOUFS(mp);
1954	struct fs *fs = ump->um_fs;
1955	struct vnode *devvp;
1956	struct cg *cgp;
1957	struct buf *bp;
1958	int error, cg;
1959	daddr_t cgbno;
1960	dev_t dev;
1961	const int needswap = UFS_FSNEEDSWAP(fs);
1962
1963	cg = ino_to_cg(fs, ino);
1964	devvp = ump->um_devvp;
1965	dev = devvp->v_rdev;
1966	cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
1967
1968	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
1969	panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
1970	(long long)dev, (unsigned long long)ino, fs->fs_fsmnt);
1971	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
1972	B_MODIFY, &bp);
1973	if (error) {
1974	return (error);
1975	}
1976	cgp = (struct cg *)bp->b_data;
1977	if (!cg_chkmagic(cgp, needswap)) {
1978	brelse(bp, `0`);
1979	return (`0`);
1980	}
1981
1982	ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);
1983
1984	bdwrite(bp);
1985
1986	return `0`;
1987	}
1988
1989	int
1990	ffs_freefile_snap(struct fs fs, struct* vnode devvp, ino_t ino, int* mode)
1991	{
1992	struct ufsmount *ump;
1993	struct cg *cgp;
1994	struct buf *bp;
1995	int error, cg;
1996	daddr_t cgbno;
1997	dev_t dev;
1998	const int needswap = UFS_FSNEEDSWAP(fs);
1999
2000	KASSERT(devvp->v_type != VBLK);
2001
2002	cg = ino_to_cg(fs, ino);
2003	dev = VTOI(devvp)->i_devvp->v_rdev;
2004	ump = VFSTOUFS(devvp->v_mount);
2005	cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
2006	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
2007	panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
2008	(unsigned long long)dev, (unsigned long long)ino,
2009	fs->fs_fsmnt);
2010	error = bread(devvp, cgbno, (int)fs->fs_cgsize,
2011	B_MODIFY, &bp);
2012	if (error) {
2013	return (error);
2014	}
2015	cgp = (struct cg *)bp->b_data;
2016	if (!cg_chkmagic(cgp, needswap)) {
2017	brelse(bp, `0`);
2018	return (`0`);
2019	}
2020	ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);
2021
2022	bdwrite(bp);
2023
2024	return `0`;
2025	}
2026
2027	static void
2028	ffs_freefile_common(struct ufsmount ump, struct* fs *fs, dev_t dev,
2029	struct buf bp, ino_t ino, int* mode, bool devvp_is_snapshot)
2030	{
2031	int cg;
2032	struct cg *cgp;
2033	u_int8_t *inosused;
2034	const int needswap = UFS_FSNEEDSWAP(fs);
2035
2036	cg = ino_to_cg(fs, ino);
2037	cgp = (struct cg *)bp->b_data;
2038	cgp->cg_old_time = ufs_rw32(time_second, needswap);
2039	if ((fs->fs_magic != FS_UFS1_MAGIC) \|\|
2040	(fs->fs_old_flags & FS_FLAGS_UPDATED))
2041	cgp->cg_time = ufs_rw64(time_second, needswap);
2042	inosused = cg_inosused(cgp, needswap);
2043	ino %= fs->fs_ipg;
2044	if (isclr(inosused, ino)) {
2045	printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
2046	(unsigned long long)dev, (unsigned long long)ino +
2047	cg * fs->fs_ipg, fs->fs_fsmnt);
2048	if (fs->fs_ronly == `0`)
2049	panic("%s: freeing free inode", __func__);
2050	}
2051	clrbit(inosused, ino);
2052	if (!devvp_is_snapshot)
2053	UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp,
2054	ino + cg * fs->fs_ipg, mode);
2055	if (ino < ufs_rw32(cgp->cg_irotor, needswap))
2056	cgp->cg_irotor = ufs_rw32(ino, needswap);
2057	ufs_add32(cgp->cg_cs.cs_nifree, `1`, needswap);
2058	mutex_enter(&ump->um_lock);
2059	fs->fs_cstotal.cs_nifree++;
2060	fs->fs_cs(fs, cg).cs_nifree++;
2061	if ((mode & IFMT) == IFDIR) {
2062	ufs_add32(cgp->cg_cs.cs_ndir, -`1`, needswap);
2063	fs->fs_cstotal.cs_ndir--;
2064	fs->fs_cs(fs, cg).cs_ndir--;
2065	}
2066	fs->fs_fmod = `1`;
2067	ACTIVECG_CLR(fs, cg);
2068	mutex_exit(&ump->um_lock);
2069	}
2070
2071	/*
2072	* Check to see if a file is free.
2073	*/
2074	int
2075	ffs_checkfreefile(struct fs fs, struct* vnode *devvp, ino_t ino)
2076	{
2077	struct cg *cgp;
2078	struct buf *bp;
2079	daddr_t cgbno;
2080	int ret, cg;
2081	u_int8_t *inosused;
2082	const bool devvp_is_snapshot = (devvp->v_type != VBLK);
2083
2084	KASSERT(devvp_is_snapshot);
2085
2086	cg = ino_to_cg(fs, ino);
2087	if (devvp_is_snapshot)
2088	cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
2089	else
2090	cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
2091	if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg)
2092	return `1`;
2093	if (bread(devvp, cgbno, (int)fs->fs_cgsize, `0`, &bp)) {
2094	return `1`;
2095	}
2096	cgp = (struct cg *)bp->b_data;
2097	if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
2098	brelse(bp, `0`);
2099	return `1`;
2100	}
2101	inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
2102	ino %= fs->fs_ipg;
2103	ret = isclr(inosused, ino);
2104	brelse(bp, `0`);
2105	return ret;
2106	}
2107
2108	/*
2109	* Find a block of the specified size in the specified cylinder group.
2110	*
2111	* It is a panic if a request is made to find a block if none are
2112	* available.
2113	*/
2114	static int32_t
2115	ffs_mapsearch(struct fs fs, struct* cg cgp, daddr_t bpref, int* allocsiz)
2116	{
2117	int32_t bno;
2118	int start, len, loc, i;
2119	int blk, field, subfield, pos;
2120	int ostart, olen;
2121	u_int8_t *blksfree;
2122	const int needswap = UFS_FSNEEDSWAP(fs);
2123
2124	/ KASSERT(mutex_owned(&ump->um_lock)); /
2125
2126	/*
2127	* find the fragment by searching through the free block
2128	* map for an appropriate bit pattern
2129	*/
2130	if (bpref)
2131	start = dtogd(fs, bpref) / NBBY;
2132	else
2133	start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
2134	blksfree = cg_blksfree(cgp, needswap);
2135	len = howmany(fs->fs_fpg, NBBY) - start;
2136	ostart = start;
2137	olen = len;
2138	loc = scanc((u_int)len,
2139	(const u_char *)&blksfree[start],
2140	(const u_char *)fragtbl[fs->fs_frag],
2141	(`1` << (allocsiz - `1` + (fs->fs_frag & (NBBY - `1`)))));
2142	if (loc == `0`) {
2143	len = start + `1`;
2144	start = `0`;
2145	loc = scanc((u_int)len,
2146	(const u_char *)&blksfree[`0`],
2147	(const u_char *)fragtbl[fs->fs_frag],
2148	(`1` << (allocsiz - `1` + (fs->fs_frag & (NBBY - `1`)))));
2149	if (loc == `0`) {
2150	panic("%s: map corrupted: start=%d, len=%d, "
2151	"fs = %s, offset=%d/%ld, cg %d", __func__,
2152	ostart, olen, fs->fs_fsmnt,
2153	ufs_rw32(cgp->cg_freeoff, needswap),
2154	(long)blksfree - (long)cgp, cgp->cg_cgx);
2155	/ NOTREACHED /
2156	}
2157	}
2158	bno = (start + len - loc) * NBBY;
2159	cgp->cg_frotor = ufs_rw32(bno, needswap);
2160	/*
2161	* found the byte in the map
2162	* sift through the bits to find the selected frag
2163	*/
2164	for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
2165	blk = blkmap(fs, blksfree, bno);
2166	blk <<= `1`;
2167	field = around[allocsiz];
2168	subfield = inside[allocsiz];
2169	for (pos = `0`; pos <= fs->fs_frag - allocsiz; pos++) {
2170	if ((blk & field) == subfield)
2171	return (bno + pos);
2172	field <<= `1`;
2173	subfield <<= `1`;
2174	}
2175	}
2176	panic("%s: block not in map: bno=%d, fs=%s", __func__,
2177	bno, fs->fs_fsmnt);
2178	/ return (-1); /
2179	}
2180
2181	/*
2182	* Fserr prints the name of a file system with an error diagnostic.
2183	*
2184	* The form of the error message is:
2185	* fs: error message
2186	*/
2187	static void
2188	ffs_fserr(struct fs fs, kauth_cred_t cred, const* char *cp)
2189	{
2190	KASSERT(cred != NULL);
2191
2192	if (cred == NOCRED \|\| cred == FSCRED) {
2193	log(LOG_ERR, "pid %d, command %s, on %s: %s\n",
2194	curproc->p_pid, curproc->p_comm,
2195	fs->fs_fsmnt, cp);
2196	} else {
2197	log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
2198	kauth_cred_getuid(cred), curproc->p_pid, curproc->p_comm,
2199	fs->fs_fsmnt, cp);
2200	}
2201	}
2202

Browse the source code of src/src/sys/ufs/ffs/ffs_alloc.c