1  /* SPDX-License-Identifier: GPL-2.0 */
2  #ifndef _BCACHEFS_JOURNAL_TYPES_H
3  #define _BCACHEFS_JOURNAL_TYPES_H
4  
5  #include <linux/cache.h>
6  #include <linux/workqueue.h>
7  
8  #include "alloc_types.h"
9  #include "super_types.h"
10  #include "fifo.h"
11  
/*
 * Number of journal buffers, kept as a power of two: JOURNAL_BUF_NR is
 * 1 << JOURNAL_BUF_BITS and JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) is the
 * matching low-bits mask. JOURNAL_BUF_NR sizes journal.buf[] and
 * journal_device.bio[].
 */
#define JOURNAL_BUF_BITS	2
#define JOURNAL_BUF_NR		(1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK	(JOURNAL_BUF_NR - 1)
15  
/*
 * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
 * the journal that are being staged or in flight.
 */
struct journal_buf {
	struct closure		io;
	struct jset		*data;		/* buffer of buf_size bytes, see below */

	__BKEY_PADDED(key, BCH_REPLICAS_MAX);
	struct bch_devs_list	devs_written;

	struct closure_waitlist	wait;
	u64			last_seq;	/* copy of data->last_seq */
	long			expires;
	u64			flush_time;

	unsigned		buf_size;	/* size in bytes of @data */
	unsigned		sectors;	/* maximum size for current entry */
	unsigned		disk_sectors;	/* maximum size entry could have been, if
						   buf_size was bigger */
	unsigned		u64s_reserved;
	/* Single-bit state flags: */
	bool			noflush:1;	/* write has already been kicked off, and was noflush */
	bool			must_flush:1;	/* something wants a flush */
	bool			separate_flush:1;
	bool			need_flush_to_write_buffer:1;
	bool			write_started:1;
	bool			write_allocated:1;
	bool			write_done:1;
	u8			idx;		/* presumably this buffer's index in journal.buf[] - TODO confirm */
};
46  
47  /*
48   * Something that makes a journal entry dirty - i.e. a btree node that has to be
49   * flushed:
50   */
51  
/*
 * Categories of journal pin. JOURNAL_PIN_NR is not a pin type: it is the
 * number of types, and sizes journal_entry_pin_list.list[].
 */
enum journal_pin_type {
	JOURNAL_PIN_btree,
	JOURNAL_PIN_key_cache,
	JOURNAL_PIN_other,
	JOURNAL_PIN_NR,
};
58  
/*
 * Pin bookkeeping for one journal entry; struct journal's pin FIFO is an
 * array of these (journal.pin.data).
 */
struct journal_entry_pin_list {
	struct list_head		list[JOURNAL_PIN_NR];	/* one list head per enum journal_pin_type */
	struct list_head		flushed;
	atomic_t			count;
	struct bch_devs_list		devs;
};
65  
struct journal;
struct journal_entry_pin;
/*
 * Callback type stored in journal_entry_pin.flush. It receives the journal,
 * the pin itself and a u64 (presumably a journal sequence number - TODO
 * confirm against callers), and returns an int.
 */
typedef int (*journal_pin_flush_fn)(struct journal *j,
				struct journal_entry_pin *, u64);
70  
/*
 * A single journal pin: a list node, a flush callback and a sequence number.
 */
struct journal_entry_pin {
	struct list_head		list;	/* presumably linked onto a journal_entry_pin_list list - TODO confirm */
	journal_pin_flush_fn		flush;
	u64				seq;
};
76  
/*
 * Journal reservation handle.
 *
 * NOTE(review): the field meanings below are inferred from names only and
 * should be confirmed against the reservation code:
 *  - ref:    whether a reference is held
 *  - idx:    journal buffer index
 *  - u64s:   size of the reservation, in u64s
 *  - offset: position within the journal entry
 *  - seq:    journal sequence number
 */
struct journal_res {
	bool			ref;
	u8			idx;
	u16			u64s;
	u32			offset;
	u64			seq;
};
84  
/*
 * Journal reservation state, with three overlaid views of the same storage:
 * an atomic64_t (counter), a plain u64 (v), and a set of bitfields.
 *
 * The bitfield widths add up to 64 bits: 20 (cur_entry_offset) + 2 (idx) +
 * 2 (unwritten_idx) + 4 * 10 (one count per journal buffer, matching
 * JOURNAL_BUF_NR == 4).
 */
union journal_res_state {
	struct {
		atomic64_t	counter;
	};

	struct {
		u64		v;
	};

	struct {
		/* 20 bits: see JOURNAL_ENTRY_OFFSET_MAX and the sentinel values */
		u64		cur_entry_offset:20,
				idx:2,
				unwritten_idx:2,
				buf0_count:10,
				buf1_count:10,
				buf2_count:10,
				buf3_count:10;
	};
};
104  
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN		(64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX		(4U  << 20) /* 4M */

/*
 * We stash some journal state as sentinel values in cur_entry_offset:
 * note - cur_entry_offset is in units of u64s
 *
 * JOURNAL_ENTRY_OFFSET_MAX is the largest value that fits in the 20-bit
 * cur_entry_offset field; the two highest values are used as the sentinels.
 */
#define JOURNAL_ENTRY_OFFSET_MAX	((1U << 20) - 1)

#define JOURNAL_ENTRY_CLOSED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL		(JOURNAL_ENTRY_OFFSET_MAX)
117  
/*
 * An amount of journal space; struct journal keeps one of these per
 * enum journal_space_from value (journal.space[]).
 */
struct journal_space {
	/* Units of 512 bytes sectors: */
	unsigned	next_entry; /* How big the next journal entry can be */
	unsigned	total;
};
123  
/*
 * Index into journal.space[]. journal_space_nr is the number of entries, not
 * an index in its own right.
 */
enum journal_space_from {
	journal_space_discarded,
	journal_space_clean_ondisk,
	journal_space_clean,
	journal_space_total,
	journal_space_nr,
};
131  
/*
 * X-macro list of journal flags: each x(n) entry expands to the enumerator
 * JOURNAL_<n> in enum journal_flags below.
 * NOTE(review): presumably these are bit numbers within journal.flags -
 * confirm against users.
 */
#define JOURNAL_FLAGS()			\
	x(replay_done)			\
	x(running)			\
	x(may_skip_flush)		\
	x(need_flush_write)		\
	x(space_low)

enum journal_flags {
#define x(n)	JOURNAL_##n,
	JOURNAL_FLAGS()
#undef x
};
144  
/*
 * Reasons we may fail to get a journal reservation:
 *
 * X-macro list; each x(n) entry expands to the enumerator JOURNAL_ERR_<n> in
 * enum journal_errors below. "ok" is listed first, so JOURNAL_ERR_ok is 0.
 */
#define JOURNAL_ERRORS()		\
	x(ok)				\
	x(retry)			\
	x(blocked)			\
	x(max_in_flight)		\
	x(journal_full)			\
	x(journal_pin_full)		\
	x(journal_stuck)		\
	x(insufficient_devices)

enum journal_errors {
#define x(n)	JOURNAL_ERR_##n,
	JOURNAL_ERRORS()
#undef x
};
161  
/* DARRAY of u64s; the type of journal.early_journal_entries */
typedef DARRAY(u64)		darray_u64;
163  
/*
 * A bio together with the device and journal buffer index it belongs to;
 * struct journal_device holds JOURNAL_BUF_NR pointers to these.
 */
struct journal_bio {
	struct bch_dev		*ca;
	unsigned		buf_idx;	/* presumably an index into journal.buf[] - TODO confirm */

	/* NOTE(review): kept as the last member; confirm whether this ordering is required */
	struct bio		bio;
};
170  
/* Embedded in struct bch_fs */
struct journal {
	/* Fastpath stuff up front: */
	/* (anonymous struct, aligned to SMP_CACHE_BYTES on its own) */
	struct {

	union journal_res_state reservations;
	enum bch_watermark	watermark;

	} __aligned(SMP_CACHE_BYTES);

	/* NOTE(review): presumably a bitmask indexed by enum journal_flags - confirm */
	unsigned long		flags;

	/* Max size of current journal entry */
	unsigned		cur_entry_u64s;
	unsigned		cur_entry_sectors;

	/* Reserved space in journal entry to be used just prior to write */
	unsigned		entry_u64s_reserved;


	/*
	 * JOURNAL_ERR_ok (0), or the reason a journal entry cannot currently
	 * be opened - e.g. waiting on journal reclaim, or insufficient
	 * devices.
	 *
	 * NOTE(review): this comment previously named -ENOSPC/-EROFS, but the
	 * field is an enum journal_errors - confirm the exact values stored.
	 */
	enum journal_errors	cur_entry_error;

	unsigned		buf_size_want;
	/*
	 * We may queue up some things to be journalled (log messages) before
	 * the journal has actually started - stash them here:
	 */
	darray_u64		early_journal_entries;

	/*
	 * Protects journal_buf->data, when accessing without a journal
	 * reservation: for synchronization between the btree write buffer code
	 * and the journal write path:
	 */
	struct mutex		buf_lock;
	/*
	 * JOURNAL_BUF_NR journal entry buffers -- one is currently open for
	 * new entries, the others are possibly being written out.
	 */
	struct journal_buf	buf[JOURNAL_BUF_NR];

	spinlock_t		lock;

	/* if nonzero, we may not open a new journal entry: */
	unsigned		blocked;

	/* Used when waiting because the journal was full */
	wait_queue_head_t	wait;
	struct closure_waitlist	async_wait;

	struct delayed_work	write_work;
	struct workqueue_struct *wq;

	/* Sequence number of most recent journal entry (last entry in @pin) */
	atomic64_t		seq;

	/* seq, last_seq from the most recent journal entry successfully written */
	u64			seq_ondisk;
	u64			flushed_seq_ondisk;
	u64			last_seq_ondisk;
	u64			err_seq;
	u64			last_empty_seq;
	u64			oldest_seq_found_ondisk;

	/*
	 * FIFO of journal entries whose btree updates have not yet been
	 * written out.
	 *
	 * Each entry is a reference count. The position in the FIFO is the
	 * entry's sequence number relative to @seq.
	 *
	 * The journal entry itself holds a reference count, put when the
	 * journal entry is written out. Each btree node modified by the journal
	 * entry also holds a reference count, put when the btree node is
	 * written.
	 *
	 * When a reference count reaches zero, the journal entry is no longer
	 * needed. When all journal entries in the oldest journal bucket are no
	 * longer needed, the bucket can be discarded and reused.
	 */
	struct {
		u64 front, back, size, mask;
		struct journal_entry_pin_list *data;
	}			pin;

	/* One entry per enum journal_space_from value */
	struct journal_space	space[journal_space_nr];

	u64			replay_journal_seq;
	u64			replay_journal_seq_end;

	struct write_point	wp;
	spinlock_t		err_lock;

	struct mutex		reclaim_lock;
	/*
	 * Used for waiting until journal reclaim has freed up space in the
	 * journal:
	 */
	wait_queue_head_t	reclaim_wait;
	struct task_struct	*reclaim_thread;
	bool			reclaim_kicked;
	unsigned long		next_reclaim;
	u64			nr_direct_reclaim;
	u64			nr_background_reclaim;

	unsigned long		last_flushed;
	struct journal_entry_pin *flush_in_progress;
	bool			flush_in_progress_dropped;
	wait_queue_head_t	pin_flush_wait;

	/* protects advancing ja->discard_idx: */
	struct mutex		discard_lock;
	bool			can_discard;

	unsigned long		last_flush_write;

	u64			write_start_time;

	/* Write counters (flush vs. noflush writes, bytes written) */
	u64			nr_flush_writes;
	u64			nr_noflush_writes;
	u64			entry_bytes_written;

	/* Pointers to time stats objects; the objects themselves live elsewhere */
	struct bch2_time_stats	*flush_write_time;
	struct bch2_time_stats	*noflush_write_time;
	struct bch2_time_stats	*flush_seq_time;

	/* Only present when lockdep (CONFIG_DEBUG_LOCK_ALLOC) is enabled */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map	res_map;
#endif
} __aligned(SMP_CACHE_BYTES);
305  
/*
 * Embedded in struct bch_dev. First three fields refer to the array of journal
 * buckets, in bch_sb.
 */
struct journal_device {
	/*
	 * For each journal bucket, contains the max sequence number of the
	 * journal writes it contains - so we know when a bucket can be reused.
	 */
	u64			*bucket_seq;

	unsigned		sectors_free;

	/*
	 * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
	 */
	unsigned		discard_idx;		/* Next bucket to discard */
	unsigned		dirty_idx_ondisk;
	unsigned		dirty_idx;
	unsigned		cur_idx;		/* Journal bucket we're currently writing to */
	/* NOTE(review): presumably the number of journal buckets on this device - confirm */
	unsigned		nr;

	/* NOTE(review): presumably the journal bucket numbers, nr entries - confirm */
	u64			*buckets;

	/* Bio for journal reads/writes to this device */
	/* (one struct journal_bio pointer per journal buffer) */
	struct journal_bio	*bio[JOURNAL_BUF_NR];

	/* for bch_journal_read_device */
	struct closure		read;
	u64			highest_seq_found;
};
337  
/*
 * journal_entry_res - reserve space in every journal entry:
 *
 * @u64s: presumably the amount of space reserved, in u64s - TODO confirm
 */
struct journal_entry_res {
	unsigned		u64s;
};
344  
345  #endif /* _BCACHEFS_JOURNAL_TYPES_H */
346