1  // SPDX-License-Identifier: GPL-2.0
2  #include <linux/ceph/ceph_debug.h>
3  
4  #include <linux/file.h>
5  #include <linux/namei.h>
6  #include <linux/random.h>
7  
8  #include "super.h"
9  #include "mds_client.h"
10  #include <linux/filelock.h>
11  #include <linux/ceph/pagelist.h>
12  
/*
 * Random value XORed into lock-owner pointers by secure_addr() before
 * they are placed in MDS lock messages; filled in by ceph_flock_init().
 */
static u64 lock_secret;
/* Forward declaration: referenced by ceph_lock_message(), defined below. */
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
                                         struct ceph_mds_request *req);
16  
secure_addr(void * addr)17  static inline u64 secure_addr(void *addr)
18  {
19  	u64 v = lock_secret ^ (u64)(unsigned long)addr;
20  	/*
21  	 * Set the most significant bit, so that MDS knows the 'owner'
22  	 * is sufficient to identify the owner of lock. (old code uses
23  	 * both 'owner' and 'pid')
24  	 */
25  	v |= (1ULL << 63);
26  	return v;
27  }
28  
ceph_flock_init(void)29  void __init ceph_flock_init(void)
30  {
31  	get_random_bytes(&lock_secret, sizeof(lock_secret));
32  }
33  
ceph_fl_copy_lock(struct file_lock * dst,struct file_lock * src)34  static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
35  {
36  	struct inode *inode = file_inode(dst->c.flc_file);
37  	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
38  	dst->fl_u.ceph.inode = igrab(inode);
39  }
40  
41  /*
42   * Do not use the 'fl->fl_file' in release function, which
43   * is possibly already released by another thread.
44   */
ceph_fl_release_lock(struct file_lock * fl)45  static void ceph_fl_release_lock(struct file_lock *fl)
46  {
47  	struct inode *inode = fl->fl_u.ceph.inode;
48  	struct ceph_inode_info *ci;
49  
50  	/*
51  	 * If inode is NULL it should be a request file_lock,
52  	 * nothing we can do.
53  	 */
54  	if (!inode)
55  		return;
56  
57  	ci = ceph_inode(inode);
58  	if (atomic_dec_and_test(&ci->i_filelock_ref)) {
59  		/* clear error when all locks are released */
60  		spin_lock(&ci->i_ceph_lock);
61  		ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
62  		spin_unlock(&ci->i_ceph_lock);
63  	}
64  	fl->fl_u.ceph.inode = NULL;
65  	iput(inode);
66  }
67  
/*
 * Per-lock callbacks that ceph_lock_message() installs on a file_lock for
 * SETFILELOCK requests. The copy hook takes an inode reference and bumps
 * i_filelock_ref; the release hook drops both.
 */
static const struct file_lock_operations ceph_fl_lock_ops = {
	.fl_copy_lock = ceph_fl_copy_lock,
	.fl_release_private = ceph_fl_release_lock,
};
72  
/*
 * Implement fcntl and flock locking functions.
 *
 * Build one file-lock request from @fl, send it to the MDS and wait for
 * the reply. For CEPH_MDS_OP_GETFILELOCK the reply is copied back into
 * @fl (pid, type, start and end).
 *
 * @lock_type: lock rule stored in the request (callers in this file pass
 *             CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK)
 * @operation: MDS operation, CEPH_MDS_OP_SETFILELOCK or
 *             CEPH_MDS_OP_GETFILELOCK
 * @inode:     inode the lock applies to; an extra reference is taken for
 *             the request
 * @cmd:       Ceph lock command stored as the request's 'type'
 *             (CEPH_LOCK_SHARED, CEPH_LOCK_EXCL or CEPH_LOCK_UNLOCK)
 * @wait:      non-zero to block for the lock; forced to 0 for anything
 *             other than a SETFILELOCK that is not an unlock
 * @fl:        the VFS lock describing owner, pid and byte range
 *
 * Returns 0 on success, or the error from creating, submitting or waiting
 * on the request.
 */
static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
			     int cmd, u8 wait, struct file_lock *fl)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_request *req;
	int err;
	u64 length = 0;
	u64 owner;

	if (operation == CEPH_MDS_OP_SETFILELOCK) {
		/*
		 * increasing i_filelock_ref closes race window between
		 * handling request reply and adding file_lock struct to
		 * inode. Otherwise, auth caps may get trimmed in the
		 * window. Caller function will decrease the counter.
		 */
		fl->fl_ops = &ceph_fl_lock_ops;
		fl->fl_ops->fl_copy_lock(fl, NULL);
	}

	/* only a SETFILELOCK that actually takes a lock may block */
	if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
		wait = 0;

	req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = inode;
	/* reference for req->r_inode */
	ihold(inode);
	req->r_num_caps = 1;

	/* mds requires start and length rather than start and end */
	if (LLONG_MAX == fl->fl_end)
		length = 0;
	else
		length = fl->fl_end - fl->fl_start + 1;

	owner = secure_addr(fl->c.flc_owner);

	doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
		    "start: %llu, length: %llu, wait: %d, type: %d\n",
		    (int)lock_type, (int)operation, owner,
		    (u64) fl->c.flc_pid,
		    fl->fl_start, length, wait, fl->c.flc_type);

	/* multi-byte fields go out little-endian */
	req->r_args.filelock_change.rule = lock_type;
	req->r_args.filelock_change.type = cmd;
	req->r_args.filelock_change.owner = cpu_to_le64(owner);
	req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid);
	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
	req->r_args.filelock_change.length = cpu_to_le64(length);
	req->r_args.filelock_change.wait = wait;

	err = ceph_mdsc_submit_request(mdsc, inode, req);
	/* blocking requests use the interruptible wait helper */
	if (!err)
		err = ceph_mdsc_wait_request(mdsc, req, wait ?
					ceph_lock_wait_for_completion : NULL);
	if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
		/* the pid from the reply is stored negated */
		fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
			fl->c.flc_type = F_RDLCK;
		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
			fl->c.flc_type = F_WRLCK;
		else
			fl->c.flc_type = F_UNLCK;

		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
		/* 'length' is reused here to hold start + length */
		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
						 le64_to_cpu(req->r_reply_info.filelock_reply->length);
		/*
		 * NOTE(review): the request path above sends length 0 for a
		 * lock whose end is LLONG_MAX. If the reply uses the same
		 * convention, a reply with start 0 and length 0 ends up with
		 * fl_end == 0 here rather than an open-ended range - confirm
		 * against the MDS reply format.
		 */
		if (length >= 1)
			fl->fl_end = length -1;
		else
			fl->fl_end = 0;

	}
	ceph_mdsc_put_request(req);
	doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
	      "length: %llu, wait: %d, type: %d, err code %d\n",
	      (int)lock_type, (int)operation, (u64) fl->c.flc_pid,
	      fl->fl_start, length, wait, fl->c.flc_type, err);
	return err;
}
158  
/*
 * Wait callback for blocking SETFILELOCK requests (passed to
 * ceph_mdsc_wait_request() by ceph_lock_message()).
 *
 * Waits interruptibly for @req to complete. If the wait is interrupted
 * before a result arrived and the request had already been handed to a
 * session, the request is marked aborted and a second SETFILELOCK request
 * with the matching *_INTR rule and type CEPH_LOCK_UNLOCK is sent; the
 * function then waits (killable) on the original request's
 * r_safe_completion.
 *
 * Returns 0 when the request completed, when the result raced in with the
 * interruption, when the request had not been sent yet, or after the
 * interrupt request went through (or itself returned -ERESTARTSYS).
 * Otherwise returns the error from creating or performing the interrupt
 * request.
 */
static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
                                         struct ceph_mds_request *req)
{
	struct ceph_client *cl = mdsc->fsc->client;
	struct ceph_mds_request *intr_req;
	struct inode *inode = req->r_inode;
	int err, lock_type;

	/* only blocking lock acquisitions are expected here */
	BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK);
	/* pick the interrupt rule that matches the original lock rule */
	if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL)
		lock_type = CEPH_LOCK_FCNTL_INTR;
	else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK)
		lock_type = CEPH_LOCK_FLOCK_INTR;
	else
		BUG_ON(1);
	BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK);

	err = wait_for_completion_interruptible(&req->r_completion);
	if (!err)
		return 0;

	doutc(cl, "request %llu was interrupted\n", req->r_tid);

	mutex_lock(&mdsc->mutex);
	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
		/* the result arrived anyway; treat the wait as successful */
		err = 0;
	} else {
		/*
		 * ensure we aren't running concurrently with
		 * ceph_fill_trace or ceph_readdir_prepopulate, which
		 * rely on locks (dir mutex) held by our caller.
		 */
		mutex_lock(&req->r_fill_mutex);
		req->r_err = err;
		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
		mutex_unlock(&req->r_fill_mutex);

		if (!req->r_session) {
			// haven't sent the request
			err = 0;
		}
	}
	mutex_unlock(&mdsc->mutex);
	if (!err)
		return 0;

	/* the request was sent to a session: send an interrupt for it */
	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
					    USE_AUTH_MDS);
	if (IS_ERR(intr_req))
		return PTR_ERR(intr_req);

	intr_req->r_inode = inode;
	/* reference for intr_req->r_inode */
	ihold(inode);
	intr_req->r_num_caps = 1;

	/* same owner/pid/range as the original, but *_INTR rule + UNLOCK */
	intr_req->r_args.filelock_change = req->r_args.filelock_change;
	intr_req->r_args.filelock_change.rule = lock_type;
	intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK;

	err = ceph_mdsc_do_request(mdsc, inode, intr_req);
	ceph_mdsc_put_request(intr_req);

	if (err && err != -ERESTARTSYS)
		return err;

	/* the return value of this wait is deliberately not checked */
	wait_for_completion_killable(&req->r_safe_completion);
	return 0;
}
227  
try_unlock_file(struct file * file,struct file_lock * fl)228  static int try_unlock_file(struct file *file, struct file_lock *fl)
229  {
230  	int err;
231  	unsigned int orig_flags = fl->c.flc_flags;
232  	fl->c.flc_flags |= FL_EXISTS;
233  	err = locks_lock_file_wait(file, fl);
234  	fl->c.flc_flags = orig_flags;
235  	if (err == -ENOENT) {
236  		if (!(orig_flags & FL_EXISTS))
237  			err = 0;
238  		return err;
239  	}
240  	return 1;
241  }
242  
243  /*
244   * Attempt to set an fcntl lock.
245   * For now, this just goes away to the server. Later it may be more awesome.
246   */
ceph_lock(struct file * file,int cmd,struct file_lock * fl)247  int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
248  {
249  	struct inode *inode = file_inode(file);
250  	struct ceph_inode_info *ci = ceph_inode(inode);
251  	struct ceph_client *cl = ceph_inode_to_client(inode);
252  	int err = 0;
253  	u16 op = CEPH_MDS_OP_SETFILELOCK;
254  	u8 wait = 0;
255  	u8 lock_cmd;
256  
257  	if (!(fl->c.flc_flags & FL_POSIX))
258  		return -ENOLCK;
259  
260  	if (ceph_inode_is_shutdown(inode))
261  		return -ESTALE;
262  
263  	doutc(cl, "fl_owner: %p\n", fl->c.flc_owner);
264  
265  	/* set wait bit as appropriate, then make command as Ceph expects it*/
266  	if (IS_GETLK(cmd))
267  		op = CEPH_MDS_OP_GETFILELOCK;
268  	else if (IS_SETLKW(cmd))
269  		wait = 1;
270  
271  	spin_lock(&ci->i_ceph_lock);
272  	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
273  		err = -EIO;
274  	}
275  	spin_unlock(&ci->i_ceph_lock);
276  	if (err < 0) {
277  		if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl))
278  			posix_lock_file(file, fl, NULL);
279  		return err;
280  	}
281  
282  	if (lock_is_read(fl))
283  		lock_cmd = CEPH_LOCK_SHARED;
284  	else if (lock_is_write(fl))
285  		lock_cmd = CEPH_LOCK_EXCL;
286  	else
287  		lock_cmd = CEPH_LOCK_UNLOCK;
288  
289  	if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) {
290  		err = try_unlock_file(file, fl);
291  		if (err <= 0)
292  			return err;
293  	}
294  
295  	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
296  	if (!err) {
297  		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) {
298  			doutc(cl, "locking locally\n");
299  			err = posix_lock_file(file, fl, NULL);
300  			if (err) {
301  				/* undo! This should only happen if
302  				 * the kernel detects local
303  				 * deadlock. */
304  				ceph_lock_message(CEPH_LOCK_FCNTL, op, inode,
305  						  CEPH_LOCK_UNLOCK, 0, fl);
306  				doutc(cl, "got %d on posix_lock_file, undid lock\n",
307  				      err);
308  			}
309  		}
310  	}
311  	return err;
312  }
313  
ceph_flock(struct file * file,int cmd,struct file_lock * fl)314  int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
315  {
316  	struct inode *inode = file_inode(file);
317  	struct ceph_inode_info *ci = ceph_inode(inode);
318  	struct ceph_client *cl = ceph_inode_to_client(inode);
319  	int err = 0;
320  	u8 wait = 0;
321  	u8 lock_cmd;
322  
323  	if (!(fl->c.flc_flags & FL_FLOCK))
324  		return -ENOLCK;
325  
326  	if (ceph_inode_is_shutdown(inode))
327  		return -ESTALE;
328  
329  	doutc(cl, "fl_file: %p\n", fl->c.flc_file);
330  
331  	spin_lock(&ci->i_ceph_lock);
332  	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
333  		err = -EIO;
334  	}
335  	spin_unlock(&ci->i_ceph_lock);
336  	if (err < 0) {
337  		if (lock_is_unlock(fl))
338  			locks_lock_file_wait(file, fl);
339  		return err;
340  	}
341  
342  	if (IS_SETLKW(cmd))
343  		wait = 1;
344  
345  	if (lock_is_read(fl))
346  		lock_cmd = CEPH_LOCK_SHARED;
347  	else if (lock_is_write(fl))
348  		lock_cmd = CEPH_LOCK_EXCL;
349  	else
350  		lock_cmd = CEPH_LOCK_UNLOCK;
351  
352  	if (lock_is_unlock(fl)) {
353  		err = try_unlock_file(file, fl);
354  		if (err <= 0)
355  			return err;
356  	}
357  
358  	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
359  				inode, lock_cmd, wait, fl);
360  	if (!err && F_UNLCK != fl->c.flc_type) {
361  		err = locks_lock_file_wait(file, fl);
362  		if (err) {
363  			ceph_lock_message(CEPH_LOCK_FLOCK,
364  					  CEPH_MDS_OP_SETFILELOCK,
365  					  inode, CEPH_LOCK_UNLOCK, 0, fl);
366  			doutc(cl, "got %d on locks_lock_file_wait, undid lock\n",
367  			      err);
368  		}
369  	}
370  	return err;
371  }
372  
373  /*
374   * Fills in the passed counter variables, so you can prepare pagelist metadata
375   * before calling ceph_encode_locks.
376   */
ceph_count_locks(struct inode * inode,int * fcntl_count,int * flock_count)377  void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
378  {
379  	struct ceph_client *cl = ceph_inode_to_client(inode);
380  	struct file_lock *lock;
381  	struct file_lock_context *ctx;
382  
383  	*fcntl_count = 0;
384  	*flock_count = 0;
385  
386  	ctx = locks_inode_context(inode);
387  	if (ctx) {
388  		spin_lock(&ctx->flc_lock);
389  		for_each_file_lock(lock, &ctx->flc_posix)
390  			++(*fcntl_count);
391  		for_each_file_lock(lock, &ctx->flc_flock)
392  			++(*flock_count);
393  		spin_unlock(&ctx->flc_lock);
394  	}
395  	doutc(cl, "counted %d flock locks and %d fcntl locks\n",
396  	      *flock_count, *fcntl_count);
397  }
398  
399  /*
400   * Given a pointer to a lock, convert it to a ceph filelock
401   */
lock_to_ceph_filelock(struct inode * inode,struct file_lock * lock,struct ceph_filelock * cephlock)402  static int lock_to_ceph_filelock(struct inode *inode,
403  				 struct file_lock *lock,
404  				 struct ceph_filelock *cephlock)
405  {
406  	struct ceph_client *cl = ceph_inode_to_client(inode);
407  	int err = 0;
408  
409  	cephlock->start = cpu_to_le64(lock->fl_start);
410  	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
411  	cephlock->client = cpu_to_le64(0);
412  	cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid);
413  	cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner));
414  
415  	switch (lock->c.flc_type) {
416  	case F_RDLCK:
417  		cephlock->type = CEPH_LOCK_SHARED;
418  		break;
419  	case F_WRLCK:
420  		cephlock->type = CEPH_LOCK_EXCL;
421  		break;
422  	case F_UNLCK:
423  		cephlock->type = CEPH_LOCK_UNLOCK;
424  		break;
425  	default:
426  		doutc(cl, "Have unknown lock type %d\n",
427  		      lock->c.flc_type);
428  		err = -EINVAL;
429  	}
430  
431  	return err;
432  }
433  
434  /*
435   * Encode the flock and fcntl locks for the given inode into the ceph_filelock
436   * array. Must be called with inode->i_lock already held.
437   * If we encounter more of a specific lock type than expected, return -ENOSPC.
438   */
ceph_encode_locks_to_buffer(struct inode * inode,struct ceph_filelock * flocks,int num_fcntl_locks,int num_flock_locks)439  int ceph_encode_locks_to_buffer(struct inode *inode,
440  				struct ceph_filelock *flocks,
441  				int num_fcntl_locks, int num_flock_locks)
442  {
443  	struct file_lock *lock;
444  	struct file_lock_context *ctx = locks_inode_context(inode);
445  	struct ceph_client *cl = ceph_inode_to_client(inode);
446  	int err = 0;
447  	int seen_fcntl = 0;
448  	int seen_flock = 0;
449  	int l = 0;
450  
451  	doutc(cl, "encoding %d flock and %d fcntl locks\n", num_flock_locks,
452  	      num_fcntl_locks);
453  
454  	if (!ctx)
455  		return 0;
456  
457  	spin_lock(&ctx->flc_lock);
458  	for_each_file_lock(lock, &ctx->flc_posix) {
459  		++seen_fcntl;
460  		if (seen_fcntl > num_fcntl_locks) {
461  			err = -ENOSPC;
462  			goto fail;
463  		}
464  		err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
465  		if (err)
466  			goto fail;
467  		++l;
468  	}
469  	for_each_file_lock(lock, &ctx->flc_flock) {
470  		++seen_flock;
471  		if (seen_flock > num_flock_locks) {
472  			err = -ENOSPC;
473  			goto fail;
474  		}
475  		err = lock_to_ceph_filelock(inode, lock, &flocks[l]);
476  		if (err)
477  			goto fail;
478  		++l;
479  	}
480  fail:
481  	spin_unlock(&ctx->flc_lock);
482  	return err;
483  }
484  
485  /*
486   * Copy the encoded flock and fcntl locks into the pagelist.
487   * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
488   * sequential flock locks.
489   * Returns zero on success.
490   */
ceph_locks_to_pagelist(struct ceph_filelock * flocks,struct ceph_pagelist * pagelist,int num_fcntl_locks,int num_flock_locks)491  int ceph_locks_to_pagelist(struct ceph_filelock *flocks,
492  			   struct ceph_pagelist *pagelist,
493  			   int num_fcntl_locks, int num_flock_locks)
494  {
495  	int err = 0;
496  	__le32 nlocks;
497  
498  	nlocks = cpu_to_le32(num_fcntl_locks);
499  	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
500  	if (err)
501  		goto out_fail;
502  
503  	if (num_fcntl_locks > 0) {
504  		err = ceph_pagelist_append(pagelist, flocks,
505  					   num_fcntl_locks * sizeof(*flocks));
506  		if (err)
507  			goto out_fail;
508  	}
509  
510  	nlocks = cpu_to_le32(num_flock_locks);
511  	err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
512  	if (err)
513  		goto out_fail;
514  
515  	if (num_flock_locks > 0) {
516  		err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks],
517  					   num_flock_locks * sizeof(*flocks));
518  	}
519  out_fail:
520  	return err;
521  }
522