fs/xfs/scrub/newbt.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/newbt.h"

/*
 * Estimate proper slack values for a btree that's being reloaded.
 *
 * Under most circumstances, we'll take whatever default loading value the
 * btree bulk loading code calculates for us.  However, there are some
 * exceptions to this rule:
 *
 * (0) If someone turned one of the debug knobs.
 * (1) If this is a per-AG btree and the AG has less than 10% space free.
 * (2) If this is an inode btree and the FS has less than 10% space free.

 * In either case, format the new btree blocks almost completely full to
 * minimize space usage.
 */
static void
xrep_newbt_estimate_slack(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_btree_bload	*bload = &xnr->bload;
	uint64_t		free;
	uint64_t		sz;

	/*
	 * The xfs_globals values are set to -1 (i.e. take the bload defaults)
	 * unless someone has set them otherwise, so we just pull the values
	 * here.
	 */
	bload->leaf_slack = xfs_globals.bload_leaf_slack;
	bload->node_slack = xfs_globals.bload_node_slack;

	if (sc->ops->type == ST_PERAG) {
		free = sc->sa.pag->pagf_freeblks;
		sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
	} else {
		free = percpu_counter_sum(&sc->mp->m_fdblocks);
		sz = sc->mp->m_sb.sb_dblocks;
	}

	/* No further changes if there's more than 10% free space left. */
	if (free >= div_u64(sz, 10))
		return;

	/*
	 * We're low on space; load the btrees as tightly as possible.  Leave
	 * a couple of open slots in each btree block so that we don't end up
	 * splitting the btrees like crazy after a mount.
	 */
	if (bload->leaf_slack < 0)
		bload->leaf_slack = 2;
	if (bload->node_slack < 0)
		bload->node_slack = 2;
}

/* Initialize accounting resources for staging a new AG btree. */
void
xrep_newbt_init_ag(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			alloc_hint,
	enum xfs_ag_resv_type		resv)
{
	memset(xnr, 0, sizeof(struct xrep_newbt));
	xnr->sc = sc;
	xnr->oinfo = *oinfo; /* structure copy */
	xnr->alloc_hint = alloc_hint;
	xnr->resv = resv;
	INIT_LIST_HEAD(&xnr->resv_list);
	xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
	xrep_newbt_estimate_slack(xnr);
}

/* Initialize accounting resources for staging a new inode fork btree. */
int
xrep_newbt_init_inode(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc,
	int				whichfork,
	const struct xfs_owner_info	*oinfo)
{
	struct xfs_ifork		*ifp;

	ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
	if (!ifp)
		return -ENOMEM;

	xrep_newbt_init_ag(xnr, sc, oinfo,
			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
			XFS_AG_RESV_NONE);
	xnr->ifake.if_fork = ifp;
	xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
	return 0;
}

/*
 * Initialize accounting resources for staging a new btree.  Callers are
 * expected to add their own reservations (and clean them up) manually.
 */
void
xrep_newbt_init_bare(
	struct xrep_newbt		*xnr,
	struct xfs_scrub		*sc)
{
	xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
			XFS_AG_RESV_NONE);
}

/*
 * Designate specific blocks to be used to build our new btree.  @pag must be
 * a passive reference.
 */
STATIC int
xrep_newbt_add_blocks(
	struct xrep_newbt		*xnr,
	struct xfs_perag		*pag,
	const struct xfs_alloc_arg	*args)
{
	struct xfs_mount		*mp = xnr->sc->mp;
	struct xrep_newbt_resv		*resv;
	int				error;

	resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
	if (!resv)
		return -ENOMEM;

	INIT_LIST_HEAD(&resv->list);
	resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
	resv->len = args->len;
	resv->used = 0;
	resv->pag = xfs_perag_hold(pag);

	if (args->tp) {
		ASSERT(xnr->oinfo.oi_offset == 0);

		error = xfs_alloc_schedule_autoreap(args,
				XFS_FREE_EXTENT_SKIP_DISCARD, &resv->autoreap);
		if (error)
			goto out_pag;
	}

	list_add_tail(&resv->list, &xnr->resv_list);
	return 0;
out_pag:
	xfs_perag_put(resv->pag);
	kfree(resv);
	return error;
}

/*
 * Add an extent to the new btree reservation pool.  Callers are required to
 * reap this reservation manually if the repair is cancelled.  @pag must be a
 * passive reference.
 */
int
xrep_newbt_add_extent(
	struct xrep_newbt	*xnr,
	struct xfs_perag	*pag,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = xnr->sc->mp;
	struct xfs_alloc_arg	args = {
		.tp		= NULL, /* no autoreap */
		.oinfo		= xnr->oinfo,
		.fsbno		= XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
		.len		= len,
		.resv		= xnr->resv,
	};

	return xrep_newbt_add_blocks(xnr, pag, &args);
}

/* Don't let our allocation hint take us beyond this AG */
static inline void
xrep_newbt_validate_ag_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);

	if (agno == sc->sa.pag->pag_agno &&
	    xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
					 XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for a new per-AG btree. */
STATIC int
xrep_newbt_alloc_ag_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	ASSERT(sc->sa.pag != NULL);

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		xfs_agnumber_t		agno;

		xrep_newbt_validate_ag_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_near_bno(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_ag_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		if (agno != sc->sa.pag->pag_agno) {
			ASSERT(agno == sc->sa.pag->pag_agno);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Don't let our allocation hint take us beyond EOFS */
static inline void
xrep_newbt_validate_file_alloc_hint(
	struct xrep_newbt	*xnr)
{
	struct xfs_scrub	*sc = xnr->sc;

	if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
		return;

	xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
}

/* Allocate disk space for our new file-based btree. */
STATIC int
xrep_newbt_alloc_file_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xfs_mount	*mp = sc->mp;
	int			error = 0;

	while (nr_blocks > 0) {
		struct xfs_alloc_arg	args = {
			.tp		= sc->tp,
			.mp		= mp,
			.oinfo		= xnr->oinfo,
			.minlen		= 1,
			.maxlen		= nr_blocks,
			.prod		= 1,
			.resv		= xnr->resv,
		};
		struct xfs_perag	*pag;
		xfs_agnumber_t		agno;

		xrep_newbt_validate_file_alloc_hint(xnr);

		if (xnr->alloc_vextent)
			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
		else
			error = xfs_alloc_vextent_start_ag(&args,
					xnr->alloc_hint);
		if (error)
			return error;
		if (args.fsbno == NULLFSBLOCK)
			return -ENOSPC;

		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);

		trace_xrep_newbt_alloc_file_blocks(mp, agno,
				XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
				xnr->oinfo.oi_owner);

		pag = xfs_perag_get(mp, agno);
		if (!pag) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		error = xrep_newbt_add_blocks(xnr, pag, &args);
		xfs_perag_put(pag);
		if (error)
			return error;

		nr_blocks -= args.len;
		xnr->alloc_hint = args.fsbno + args.len;

		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return 0;
}

/* Allocate disk space for our new btree. */
int
xrep_newbt_alloc_blocks(
	struct xrep_newbt	*xnr,
	uint64_t		nr_blocks)
{
	if (xnr->sc->ip)
		return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
	return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
}

/*
 * Free the unused part of a space extent that was reserved for a new ondisk
 * structure.  Returns the number of EFIs logged or a negative errno.
 */
STATIC int
xrep_newbt_free_extent(
	struct xrep_newbt	*xnr,
	struct xrep_newbt_resv	*resv,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	xfs_agblock_t		free_agbno = resv->agbno;
	xfs_extlen_t		free_aglen = resv->len;
	xfs_fsblock_t		fsbno;
	int			error;

	if (!btree_committed || resv->used == 0) {
		/*
		 * If we're not committing a new btree or we didn't use the
		 * space reservation, let the existing EFI free the entire
		 * space extent.
		 */
		trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
				free_agbno, free_aglen, xnr->oinfo.oi_owner);
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		return 1;
	}

	/*
	 * We used space and committed the btree.  Cancel the autoreap, remove
	 * the written blocks from the reservation, and possibly log a new EFI
	 * to free any unused reservation space.
	 */
	xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
	free_agbno += resv->used;
	free_aglen -= resv->used;

	if (free_aglen == 0)
		return 0;

	trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
			free_aglen, xnr->oinfo.oi_owner);

	ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
	ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);

	/*
	 * Use EFIs to free the reservations.  This reduces the chance
	 * that we leak blocks if the system goes down.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
	error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
			xnr->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	return 1;
}

/* Free all the accounting info and disk space we reserved for a new btree. */
STATIC int
xrep_newbt_free(
	struct xrep_newbt	*xnr,
	bool			btree_committed)
{
	struct xfs_scrub	*sc = xnr->sc;
	struct xrep_newbt_resv	*resv, *n;
	unsigned int		freed = 0;
	int			error = 0;

	/*
	 * If the filesystem already went down, we can't free the blocks.  Skip
	 * ahead to freeing the incore metadata because we can't fix anything.
	 */
	if (xfs_is_shutdown(sc->mp))
		goto junkit;

	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		int		ret;

		ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
		if (ret < 0) {
			error = ret;
			goto junkit;
		}

		freed += ret;
		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
			error = xrep_defer_finish(sc);
			if (error)
				goto junkit;
			freed = 0;
		}
	}

	if (freed)
		error = xrep_defer_finish(sc);

junkit:
	/*
	 * If we still have reservations attached to @newbt, cleanup must have
	 * failed and the filesystem is about to go down.  Clean up the incore
	 * reservations and try to commit to freeing the space we used.
	 */
	list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
		xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
		list_del(&resv->list);
		xfs_perag_put(resv->pag);
		kfree(resv);
	}

	if (sc->ip) {
		kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
		xnr->ifake.if_fork = NULL;
	}

	return error;
}

/*
 * Free all the accounting info and unused disk space allocations after
 * committing a new btree.
 */
int
xrep_newbt_commit(
	struct xrep_newbt	*xnr)
{
	return xrep_newbt_free(xnr, true);
}

/*
 * Free all the accounting info and all of the disk space we reserved for a new
 * btree that we're not going to commit.  We want to try to roll things back
 * cleanly for things like ENOSPC midway through allocation.
 */
void
xrep_newbt_cancel(
	struct xrep_newbt	*xnr)
{
	xrep_newbt_free(xnr, false);
}

/* Feed one of the reserved btree blocks to the bulk loader. */
int
xrep_newbt_claim_block(
	struct xfs_btree_cur	*cur,
	struct xrep_newbt	*xnr,
	union xfs_btree_ptr	*ptr)
{
	struct xrep_newbt_resv	*resv;
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agblock_t		agbno;

	/*
	 * The first item in the list should always have a free block unless
	 * we're completely out.
	 */
	resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
	if (resv->used == resv->len)
		return -ENOSPC;

	/*
	 * Peel off a block from the start of the reservation.  We allocate
	 * blocks in order to place blocks on disk in increasing record or key
	 * order.  The block reservations tend to end up on the list in
	 * decreasing order, which hopefully results in leaf blocks ending up
	 * together.
	 */
	agbno = resv->agbno + resv->used;
	resv->used++;

	/* If we used all the blocks in this reservation, move it to the end. */
	if (resv->used == resv->len)
		list_move_tail(&resv->list, &xnr->resv_list);

	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
			xnr->oinfo.oi_owner);

	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
								agbno));
	else
		ptr->s = cpu_to_be32(agbno);

	/* Relog all the EFIs. */
	return xrep_defer_finish(xnr->sc);
}

/* How many reserved blocks are unused? */
unsigned int
xrep_newbt_unused_blocks(
	struct xrep_newbt	*xnr)
{
	struct xrep_newbt_resv	*resv;
	unsigned int		unused = 0;

	list_for_each_entry(resv, &xnr->resv_list, list)
		unused += resv->len - resv->used;
	return unused;
}