// SPDX-License-Identifier: GPL-2.0-only

/*
 * rcuref - A scalable reference count implementation for RCU managed objects
 *
 * rcuref is provided to replace open coded reference count implementations
 * based on atomic_t. It explicitly protects RCU managed objects, which can
 * be visible even after the last reference has been dropped and the object
 * is heading towards destruction.
 *
 * A common usage pattern is:
 *
 * get()
 *	rcu_read_lock();
 *	p = get_ptr();
 *	if (p && !atomic_inc_not_zero(&p->refcnt))
 *		p = NULL;
 *	rcu_read_unlock();
 *	return p;
 *
 * put()
 *	if (!atomic_dec_return(&p->refcnt)) {
 *		remove_ptr(p);
 *		kfree_rcu(p, rcu);
 *	}
 *
 * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has
 * O(N^2) behaviour under contention with N concurrent operations.
 *
 * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales
 * better under contention.
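 *
 * The same pattern converted to rcuref, sketched here with the same
 * hypothetical get_ptr()/remove_ptr() helpers as above and refcnt being
 * an rcuref_t:
 *
 * get()
 *	rcu_read_lock();
 *	p = get_ptr();
 *	if (p && !rcuref_get(&p->refcnt))
 *		p = NULL;
 *	rcu_read_unlock();
 *	return p;
 *
 * put()
 *	if (rcuref_put(&p->refcnt)) {
 *		remove_ptr(p);
 *		kfree_rcu(p, rcu);
 *	}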
 *
 * Why not refcount?
 * =================
 *
 * In principle it should be possible to make refcount use the rcuref
 * scheme, but the destruction race described below cannot be prevented
 * unless the protected object is RCU managed.
 *
 * Theory of operation
 * ===================
 *
 * rcuref uses an unsigned integer reference counter. As long as the
 * counter value is greater than or equal to RCUREF_ONEREF and not larger
 * than RCUREF_MAXREF the reference is alive:
 *
 * ONEREF   MAXREF                SATURATED             RELEASED      DEAD    NOREF
 * 0        0x7FFFFFFF 0x80000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF
 * <---valid ---------> <-------saturation zone-------> <-----dead zone----->
 *
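 * The zone boundaries above correspond to the constants in
 * include/linux/rcuref.h:
 *
 *	RCUREF_ONEREF		0x00000000U
 *	RCUREF_MAXREF		0x7FFFFFFFU
 *	RCUREF_SATURATED	0xA0000000U
 *	RCUREF_RELEASED		0xC0000000U
 *	RCUREF_DEAD		0xE0000000U
 *	RCUREF_NOREF		0xFFFFFFFFU
 *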
 * The get() and put() operations do unconditional increments and
 * decrements. The result is checked after the operation. This optimizes
 * for the fast path.
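 *
 * A sketch of those fast paths, modeled after rcuref_get() and
 * __rcuref_put() in include/linux/rcuref.h (illustrative, not a verbatim
 * copy):
 *
 * get()
 *	if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt)))
 *		return true;
 *	return rcuref_get_slowpath(ref);
 *
 * put()
 *	if (likely(!atomic_add_negative_release(-1, &ref->refcnt)))
 *		return false;
 *	return rcuref_put_slowpath(ref);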
 *
 * If the reference count is saturated or dead, then the increments and
 * decrements are not harmful as the reference count still stays in the
 * respective zones and is always set back to SATURATED or DEAD,
 * respectively. The zones have room for 2^28 racing operations in each
 * direction, which makes it practically impossible to escape the zones.
 *
 * Once the last reference is dropped the reference count becomes
 * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The
 * slowpath then tries to set the reference count from RCUREF_NOREF to
 * RCUREF_DEAD via a cmpxchg(). This opens a small window where a
 * concurrent rcuref_get() can acquire the reference count and bring it
 * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD.
 *
 * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in
 * DEAD + 1, which is inside the dead zone. If that happens the reference
 * count is put back to DEAD.
 *
 * The actual race is possible due to the unconditional increments and
 * decrements in rcuref_get() and rcuref_put():
 *
 *	T1				T2
 *	get()				put()
 *					if (atomic_add_negative(-1, &ref->refcnt))
 *		succeeds->			atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);
 *
 *	atomic_add_negative(1, &ref->refcnt);	<- Elevates refcount to DEAD + 1
 *
 * As the result of T1's add is negative, the get() goes into the slow path
 * and observes refcnt being in the dead zone, which makes the operation fail.
 *
 * Possible critical states:
 *
 *	Context Counter	References	Operation
 *	T1	0	1		init()
 *	T2	1	2		get()
 *	T1	0	1		put()
 *	T2     -1	0		put() tries to mark dead
 *	T1	0	1		get()
 *	T2	0	1		put() mark dead fails
 *	T1     -1	0		put() tries to mark dead
 *	T1    DEAD	0		put() mark dead succeeds
 *	T2    DEAD+1	0		get() fails and puts it back to DEAD
 *
 * Of course there are more complex scenarios, but the above illustrates
 * the working principle. The rest is left to the imagination of the
 * reader.
 *
 * Deconstruction race
 * ===================
 *
 * The release operation must be protected by prohibiting a grace period in
 * order to prevent a possible use after free:
 *
 *	T1				T2
 *	put()				get()
 *	// ref->refcnt = ONEREF
 *	if (!atomic_add_negative(-1, &ref->refcnt))
 *		return false;				<- Not taken
 *
 *	// ref->refcnt == NOREF
 *	--> preemption
 *					// Elevates ref->refcnt to ONEREF
 *					if (!atomic_add_negative(1, &ref->refcnt))
 *						return true;			<- taken
 *
 *					if (put(&p->ref)) { <-- Succeeds
 *						remove_pointer(p);
 *						kfree_rcu(p, rcu);
 *					}
 *
 *		RCU grace period ends, object is freed
 *
 *	atomic_cmpxchg(&ref->refcnt, NOREF, DEAD);	<- UAF
 *
 * This is prevented by disabling preemption around the put() operation,
 * which in most kernel configurations is cheaper than a rcu_read_lock() /
 * rcu_read_unlock() pair and in many cases even a NOOP. Either way it
 * prevents the grace period from completing, which keeps the object alive
 * until all put() operations complete.
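 *
 * A sketch of that wrapper, modeled after rcuref_put() in
 * include/linux/rcuref.h (illustrative, not a verbatim copy):
 *
 * put()
 *	preempt_disable();
 *	released = __rcuref_put(ref);
 *	preempt_enable();
 *	return released;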
 *
 * Saturation protection
 * =====================
 *
 * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX).
 * Once this is exceeded the reference count is set to RCUREF_SATURATED
 * and pinned there, which causes a memory leak, but it prevents wrap
 * arounds which obviously cause worse problems than a memory leak. When
 * saturation is reached a warning is emitted.
 *
 * Race conditions
 * ===============
 *
 * All reference count increment/decrement operations are unconditional and
 * only verified after the fact. This optimizes for the good case and takes
 * the occasional race vs. a dead or already saturated refcount into
 * account. The saturation and dead zones are large enough to accommodate
 * that.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed compared to regular atomic_t
 * functions and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object to increase the
 * reference count on will provide the ordering. For locked data
 * structures, it's the lock acquire; for RCU/lockless data structures, it's
 * the dependent load.
 *
 * rcuref_get() provides a control dependency ordering subsequent stores,
 * which ensures that the object is not modified when acquiring a reference
 * fails.
 *
 * rcuref_put() provides release ordering, i.e. all prior loads and stores
 * will be issued before the reference is dropped. It also provides a
 * control dependency ordering against the subsequent destruction of the
 * object.
 *
 * If rcuref_put() successfully dropped the last reference and marked the
 * object DEAD it also provides acquire ordering.
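 *
 * A minimal usage sketch relying on these guarantees, using the same
 * hypothetical object as in the examples above:
 *
 *	p->data = final_update;		// Ordered before the drop by release
 *	if (rcuref_put(&p->refcnt)) {	// Acquire ordering on the last drop
 *		remove_ptr(p);		// Destruction cannot be reordered
 *		kfree_rcu(p, rcu);	// before the DEAD marking succeeded
 *	}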
 */

#include <linux/export.h>
#include <linux/rcuref.h>

/**
 * rcuref_get_slowpath - Slowpath of rcuref_get()
 * @ref:	Pointer to the reference count
 *
 * Invoked when the reference count is outside of the valid zone.
 *
 * Return:
 *	False if the reference count was already marked dead
 *
 *	True if the reference count is saturated, which prevents the
 *	object from being deconstructed ever.
 */
bool rcuref_get_slowpath(rcuref_t *ref)
{
	unsigned int cnt = atomic_read(&ref->refcnt);

	/*
	 * If the reference count was already marked dead, undo the
	 * increment so it stays in the middle of the dead zone and return
	 * fail.
	 */
	if (cnt >= RCUREF_RELEASED) {
		atomic_set(&ref->refcnt, RCUREF_DEAD);
		return false;
	}

	/*
	 * If it was saturated, warn and mark it so. In case the increment
	 * was already on a saturated value restore the saturation
	 * marker. This keeps it in the middle of the saturation zone and
	 * prevents the reference count from overflowing. This leaks the
	 * object memory, but prevents the obvious reference count overflow
	 * damage.
	 */
	if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory"))
		atomic_set(&ref->refcnt, RCUREF_SATURATED);
	return true;
}
EXPORT_SYMBOL_GPL(rcuref_get_slowpath);

/**
 * rcuref_put_slowpath - Slowpath of __rcuref_put()
 * @ref:	Pointer to the reference count
 *
 * Invoked when the reference count is outside of the valid zone.
 *
 * Return:
 *	True if this was the last reference with no future references
 *	possible. This signals the caller that it can safely schedule the
 *	object, which is protected by the reference counter, for
 *	deconstruction.
 *
 *	False if there are still active references or the put() raced
 *	with a concurrent get()/put() pair. Caller is not allowed to
 *	deconstruct the protected object.
 */
bool rcuref_put_slowpath(rcuref_t *ref)
{
	unsigned int cnt = atomic_read(&ref->refcnt);

	/* Did this drop the last reference? */
	if (likely(cnt == RCUREF_NOREF)) {
		/*
		 * Carefully try to set the reference count to RCUREF_DEAD.
		 *
		 * This can fail if a concurrent get() operation has
		 * elevated it again or the corresponding put() even marked
		 * it dead already. Both are valid situations and do not
		 * require a retry. If this fails the caller is not
		 * allowed to deconstruct the object.
		 */
		if (!atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD))
			return false;

		/*
		 * The caller can safely schedule the object for
		 * deconstruction. Provide acquire ordering.
		 */
		smp_acquire__after_ctrl_dep();
		return true;
	}

	/*
	 * If the reference count was already in the dead zone, then this
	 * put() operation is imbalanced. Warn, put the reference count back to
	 * DEAD and tell the caller to not deconstruct the object.
	 */
	if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) {
		atomic_set(&ref->refcnt, RCUREF_DEAD);
		return false;
	}

	/*
	 * This is a put() operation on a saturated refcount. Restore the
	 * mean saturation value and tell the caller to not deconstruct the
	 * object.
	 */
	if (cnt > RCUREF_MAXREF)
		atomic_set(&ref->refcnt, RCUREF_SATURATED);
	return false;
}
EXPORT_SYMBOL_GPL(rcuref_put_slowpath);