1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6 #include <linux/device.h>
7 #include <linux/eventfd.h>
8 #include <linux/file.h>
9 #include <linux/interrupt.h>
10 #include <linux/iommu.h>
11 #include <linux/module.h>
12 #include <linux/mutex.h>
13 #include <linux/notifier.h>
14 #include <linux/pci.h>
15 #include <linux/pm_runtime.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/anon_inodes.h>
21
22 #include "cmd.h"
23
/*
 * Device specification max LOAD size: the all-ones value for the bit width
 * that __mlx5_bit_sz() reports for the 'size' field of load_vhca_state_in.
 */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

/* Upper bound applied to the per-chunk buffer size when chunk mode is used. */
#define MAX_CHUNK_SIZE SZ_8M
28
mlx5vf_drvdata(struct pci_dev * pdev)29 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
30 {
31 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
32
33 return container_of(core_device, struct mlx5vf_pci_core_device,
34 core_device);
35 }
36
37 struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer * buf,unsigned long offset)38 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
39 unsigned long offset)
40 {
41 unsigned long cur_offset = 0;
42 struct scatterlist *sg;
43 unsigned int i;
44
45 /* All accesses are sequential */
46 if (offset < buf->last_offset || !buf->last_offset_sg) {
47 buf->last_offset = 0;
48 buf->last_offset_sg = buf->table.sgt.sgl;
49 buf->sg_last_entry = 0;
50 }
51
52 cur_offset = buf->last_offset;
53
54 for_each_sg(buf->last_offset_sg, sg,
55 buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
56 if (offset < sg->length + cur_offset) {
57 buf->last_offset_sg = sg;
58 buf->sg_last_entry += i;
59 buf->last_offset = cur_offset;
60 return nth_page(sg_page(sg),
61 (offset - cur_offset) / PAGE_SIZE);
62 }
63 cur_offset += sg->length;
64 }
65 return NULL;
66 }
67
mlx5vf_disable_fd(struct mlx5_vf_migration_file * migf)68 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
69 {
70 mutex_lock(&migf->lock);
71 migf->state = MLX5_MIGF_STATE_ERROR;
72 migf->filp->f_pos = 0;
73 mutex_unlock(&migf->lock);
74 }
75
mlx5vf_release_file(struct inode * inode,struct file * filp)76 static int mlx5vf_release_file(struct inode *inode, struct file *filp)
77 {
78 struct mlx5_vf_migration_file *migf = filp->private_data;
79
80 mlx5vf_disable_fd(migf);
81 mutex_destroy(&migf->lock);
82 kfree(migf);
83 return 0;
84 }
85
86 static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file * migf,loff_t pos,bool * end_of_data)87 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
88 bool *end_of_data)
89 {
90 struct mlx5_vhca_data_buffer *buf;
91 bool found = false;
92
93 *end_of_data = false;
94 spin_lock_irq(&migf->list_lock);
95 if (list_empty(&migf->buf_list)) {
96 *end_of_data = true;
97 goto end;
98 }
99
100 buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
101 buf_elm);
102 if (pos >= buf->start_pos &&
103 pos < buf->start_pos + buf->length) {
104 found = true;
105 goto end;
106 }
107
108 /*
109 * As we use a stream based FD we may expect having the data always
110 * on first chunk
111 */
112 migf->state = MLX5_MIGF_STATE_ERROR;
113
114 end:
115 spin_unlock_irq(&migf->list_lock);
116 return found ? buf : NULL;
117 }
118
mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer * vhca_buf)119 static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
120 {
121 struct mlx5_vf_migration_file *migf = vhca_buf->migf;
122
123 if (vhca_buf->stop_copy_chunk_num) {
124 bool is_header = vhca_buf->dma_dir == DMA_NONE;
125 u8 chunk_num = vhca_buf->stop_copy_chunk_num;
126 size_t next_required_umem_size = 0;
127
128 if (is_header)
129 migf->buf_header[chunk_num - 1] = vhca_buf;
130 else
131 migf->buf[chunk_num - 1] = vhca_buf;
132
133 spin_lock_irq(&migf->list_lock);
134 list_del_init(&vhca_buf->buf_elm);
135 if (!is_header) {
136 next_required_umem_size =
137 migf->next_required_umem_size;
138 migf->next_required_umem_size = 0;
139 migf->num_ready_chunks--;
140 }
141 spin_unlock_irq(&migf->list_lock);
142 if (next_required_umem_size)
143 mlx5vf_mig_file_set_save_work(migf, chunk_num,
144 next_required_umem_size);
145 return;
146 }
147
148 spin_lock_irq(&migf->list_lock);
149 list_del_init(&vhca_buf->buf_elm);
150 list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
151 spin_unlock_irq(&migf->list_lock);
152 }
153
mlx5vf_buf_read(struct mlx5_vhca_data_buffer * vhca_buf,char __user ** buf,size_t * len,loff_t * pos)154 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
155 char __user **buf, size_t *len, loff_t *pos)
156 {
157 unsigned long offset;
158 ssize_t done = 0;
159 size_t copy_len;
160
161 copy_len = min_t(size_t,
162 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
163 while (copy_len) {
164 size_t page_offset;
165 struct page *page;
166 size_t page_len;
167 u8 *from_buff;
168 int ret;
169
170 offset = *pos - vhca_buf->start_pos;
171 page_offset = offset % PAGE_SIZE;
172 offset -= page_offset;
173 page = mlx5vf_get_migration_page(vhca_buf, offset);
174 if (!page)
175 return -EINVAL;
176 page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
177 from_buff = kmap_local_page(page);
178 ret = copy_to_user(*buf, from_buff + page_offset, page_len);
179 kunmap_local(from_buff);
180 if (ret)
181 return -EFAULT;
182 *pos += page_len;
183 *len -= page_len;
184 *buf += page_len;
185 done += page_len;
186 copy_len -= page_len;
187 }
188
189 if (*pos >= vhca_buf->start_pos + vhca_buf->length)
190 mlx5vf_buf_read_done(vhca_buf);
191
192 return done;
193 }
194
mlx5vf_save_read(struct file * filp,char __user * buf,size_t len,loff_t * pos)195 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
196 loff_t *pos)
197 {
198 struct mlx5_vf_migration_file *migf = filp->private_data;
199 struct mlx5_vhca_data_buffer *vhca_buf;
200 bool first_loop_call = true;
201 bool end_of_data;
202 ssize_t done = 0;
203
204 if (pos)
205 return -ESPIPE;
206 pos = &filp->f_pos;
207
208 if (!(filp->f_flags & O_NONBLOCK)) {
209 if (wait_event_interruptible(migf->poll_wait,
210 !list_empty(&migf->buf_list) ||
211 migf->state == MLX5_MIGF_STATE_ERROR ||
212 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
213 migf->state == MLX5_MIGF_STATE_PRE_COPY ||
214 migf->state == MLX5_MIGF_STATE_COMPLETE))
215 return -ERESTARTSYS;
216 }
217
218 mutex_lock(&migf->lock);
219 if (migf->state == MLX5_MIGF_STATE_ERROR) {
220 done = -ENODEV;
221 goto out_unlock;
222 }
223
224 while (len) {
225 ssize_t count;
226
227 vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
228 &end_of_data);
229 if (first_loop_call) {
230 first_loop_call = false;
231 /* Temporary end of file as part of PRE_COPY */
232 if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
233 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
234 done = -ENOMSG;
235 goto out_unlock;
236 }
237
238 if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
239 if (filp->f_flags & O_NONBLOCK) {
240 done = -EAGAIN;
241 goto out_unlock;
242 }
243 }
244 }
245
246 if (end_of_data)
247 goto out_unlock;
248
249 if (!vhca_buf) {
250 done = -EINVAL;
251 goto out_unlock;
252 }
253
254 count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
255 if (count < 0) {
256 done = count;
257 goto out_unlock;
258 }
259 done += count;
260 }
261
262 out_unlock:
263 mutex_unlock(&migf->lock);
264 return done;
265 }
266
mlx5vf_save_poll(struct file * filp,struct poll_table_struct * wait)267 static __poll_t mlx5vf_save_poll(struct file *filp,
268 struct poll_table_struct *wait)
269 {
270 struct mlx5_vf_migration_file *migf = filp->private_data;
271 __poll_t pollflags = 0;
272
273 poll_wait(filp, &migf->poll_wait, wait);
274
275 mutex_lock(&migf->lock);
276 if (migf->state == MLX5_MIGF_STATE_ERROR)
277 pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
278 else if (!list_empty(&migf->buf_list) ||
279 migf->state == MLX5_MIGF_STATE_COMPLETE)
280 pollflags = EPOLLIN | EPOLLRDNORM;
281 mutex_unlock(&migf->lock);
282
283 return pollflags;
284 }
285
286 /*
287 * FD is exposed and user can use it after receiving an error.
288 * Mark migf in error, and wake the user.
289 */
mlx5vf_mark_err(struct mlx5_vf_migration_file * migf)290 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
291 {
292 migf->state = MLX5_MIGF_STATE_ERROR;
293 wake_up_interruptible(&migf->poll_wait);
294 }
295
/*
 * Queue the save work of stop-copy chunk @chunk_num (1-based).
 *
 * Records @next_required_umem_size and the owning migration file in the
 * chunk's work data, takes a reference on the migration file's struct file
 * (the work handler drops it with fput()) and queues the work on the
 * device's callback workqueue.
 */
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
{
	struct mlx5vf_save_work_data *work_data;

	work_data = &migf->save_data[chunk_num - 1];
	work_data->next_required_umem_size = next_required_umem_size;
	work_data->migf = migf;

	get_file(migf->filp);
	queue_work(migf->mvdev->cb_wq, &work_data->work);
}
306
307 static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file * migf,u8 index,size_t required_length)308 mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
309 u8 index, size_t required_length)
310 {
311 struct mlx5_vhca_data_buffer *buf = migf->buf[index];
312 u8 chunk_num;
313
314 WARN_ON(!buf);
315 chunk_num = buf->stop_copy_chunk_num;
316 buf->migf->buf[index] = NULL;
317 /* Checking whether the pre-allocated buffer can fit */
318 if (buf->allocated_length >= required_length)
319 return buf;
320
321 mlx5vf_put_data_buffer(buf);
322 buf = mlx5vf_get_data_buffer(buf->migf, required_length,
323 DMA_FROM_DEVICE);
324 if (IS_ERR(buf))
325 return buf;
326
327 buf->stop_copy_chunk_num = chunk_num;
328 return buf;
329 }
330
mlx5vf_mig_file_save_work(struct work_struct * _work)331 static void mlx5vf_mig_file_save_work(struct work_struct *_work)
332 {
333 struct mlx5vf_save_work_data *save_data = container_of(_work,
334 struct mlx5vf_save_work_data, work);
335 struct mlx5_vf_migration_file *migf = save_data->migf;
336 struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
337 struct mlx5_vhca_data_buffer *buf;
338
339 mutex_lock(&mvdev->state_mutex);
340 if (migf->state == MLX5_MIGF_STATE_ERROR)
341 goto end;
342
343 buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
344 save_data->chunk_num - 1,
345 save_data->next_required_umem_size);
346 if (IS_ERR(buf))
347 goto err;
348
349 if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
350 goto err_save;
351
352 goto end;
353
354 err_save:
355 mlx5vf_put_data_buffer(buf);
356 err:
357 mlx5vf_mark_err(migf);
358 end:
359 mlx5vf_state_mutex_unlock(mvdev);
360 fput(migf->filp);
361 }
362
/*
 * Queue a record announcing the stop-copy size on the save stream.
 *
 * The record is a migration header (optional tag, STOP_COPY_SIZE) followed
 * by a stop-copy data payload holding the allocated length of migf->buf[0].
 * It is written into the first page of a DMA_NONE buffer, placed at the
 * current end of the stream (migf->max_pos) and appended to migf->buf_list.
 *
 * With @track set, the record size is stored as the pre-copy initial bytes.
 *
 * Returns 0, the error from mlx5vf_get_data_buffer(), or -EINVAL when the
 * buffer has no backing page.
 */
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
				       bool track)
{
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vf_migration_header header = {};
	size_t size = sizeof(header) + sizeof(data);
	struct mlx5_vhca_data_buffer *hdr_buf;
	unsigned long irq_flags;
	struct page *page;
	u8 *dst;

	hdr_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
	if (IS_ERR(hdr_buf))
		return PTR_ERR(hdr_buf);

	page = mlx5vf_get_migration_page(hdr_buf, 0);
	if (!page) {
		mlx5vf_put_data_buffer(hdr_buf);
		return -EINVAL;
	}

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);

	/* Header first, payload right behind it. */
	dst = kmap_local_page(page);
	memcpy(dst, &header, sizeof(header));
	memcpy(dst + sizeof(header), &data, sizeof(data));
	kunmap_local(dst);

	hdr_buf->length = sizeof(header) + sizeof(data);
	hdr_buf->start_pos = hdr_buf->migf->max_pos;
	migf->max_pos += hdr_buf->length;

	spin_lock_irqsave(&migf->list_lock, irq_flags);
	list_add_tail(&hdr_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, irq_flags);

	if (track)
		migf->pre_copy_initial_bytes = size;

	return 0;
}
407
/*
 * Pre-allocate the stop-copy buffers and queue the stop-copy size record.
 *
 * Buffer size:
 *  - chunk mode: the larger of @state_size and min(MAX_CHUNK_SIZE, @full_size);
 *  - otherwise: @state_size, grown by 10% when @track is set (unless that
 *    addition overflows);
 *  and in both cases capped by the SAVE command limit.
 *
 * One data buffer and one header buffer are set up per chunk (MAX_NUM_CHUNKS
 * chunks in chunk mode, a single one otherwise). In chunk mode each pair is
 * tagged with its 1-based chunk number and the chunk's save work is
 * initialised.
 *
 * Returns 0 on success. On failure every buffer stored in migf->buf[] and
 * migf->buf_header[] for these chunks is put back and the error is returned.
 */
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
{
	struct mlx5_vhca_data_buffer *data_buf;
	struct mlx5_vhca_data_buffer *hdr_buf;
	size_t buf_size;
	int nr_chunks;
	int ret;
	int i;

	if (mvdev->chunk_mode) {
		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);

		/* from firmware perspective at least 'state_size' buffer should be set */
		buf_size = max(state_size, chunk_size);
	} else if (!track ||
		   check_add_overflow(state_size, state_size / 10, &buf_size)) {
		/*
		 * Not tracking, or the 10 percent of head room for a growing
		 * stop_copy size overflowed: use the reported size as is.
		 */
		buf_size = state_size;
	}

	/* let's not overflow the device specification max SAVE size */
	buf_size = min_t(size_t, buf_size,
		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));

	nr_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
	for (i = 0; i < nr_chunks; i++) {
		data_buf = mlx5vf_get_data_buffer(migf, buf_size,
						  DMA_FROM_DEVICE);
		if (IS_ERR(data_buf)) {
			ret = PTR_ERR(data_buf);
			goto undo;
		}
		migf->buf[i] = data_buf;

		hdr_buf = mlx5vf_get_data_buffer(migf,
				sizeof(struct mlx5_vf_migration_header),
				DMA_NONE);
		if (IS_ERR(hdr_buf)) {
			ret = PTR_ERR(hdr_buf);
			goto undo;
		}
		migf->buf_header[i] = hdr_buf;

		if (!mvdev->chunk_mode)
			continue;

		data_buf->stop_copy_chunk_num = i + 1;
		hdr_buf->stop_copy_chunk_num = i + 1;
		INIT_WORK(&migf->save_data[i].work, mlx5vf_mig_file_save_work);
		migf->save_data[i].chunk_num = i + 1;
	}

	ret = mlx5vf_add_stop_copy_header(migf, track);
	if (!ret)
		return 0;

undo:
	for (i = 0; i < nr_chunks; i++) {
		if (migf->buf[i]) {
			mlx5vf_put_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}
		if (migf->buf_header[i]) {
			mlx5vf_put_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	return ret;
}
482
mlx5vf_precopy_ioctl(struct file * filp,unsigned int cmd,unsigned long arg)483 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
484 unsigned long arg)
485 {
486 struct mlx5_vf_migration_file *migf = filp->private_data;
487 struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
488 struct mlx5_vhca_data_buffer *buf;
489 struct vfio_precopy_info info = {};
490 loff_t *pos = &filp->f_pos;
491 unsigned long minsz;
492 size_t inc_length = 0;
493 bool end_of_data = false;
494 int ret;
495
496 if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
497 return -ENOTTY;
498
499 minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
500
501 if (copy_from_user(&info, (void __user *)arg, minsz))
502 return -EFAULT;
503
504 if (info.argsz < minsz)
505 return -EINVAL;
506
507 mutex_lock(&mvdev->state_mutex);
508 if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
509 mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
510 ret = -EINVAL;
511 goto err_state_unlock;
512 }
513
514 /*
515 * We can't issue a SAVE command when the device is suspended, so as
516 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
517 * bytes that can't be read.
518 */
519 if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
520 /*
521 * Once the query returns it's guaranteed that there is no
522 * active SAVE command.
523 * As so, the other code below is safe with the proper locks.
524 */
525 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
526 NULL, MLX5VF_QUERY_INC);
527 if (ret)
528 goto err_state_unlock;
529 }
530
531 mutex_lock(&migf->lock);
532 if (migf->state == MLX5_MIGF_STATE_ERROR) {
533 ret = -ENODEV;
534 goto err_migf_unlock;
535 }
536
537 if (migf->pre_copy_initial_bytes > *pos) {
538 info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
539 } else {
540 info.dirty_bytes = migf->max_pos - *pos;
541 if (!info.dirty_bytes)
542 end_of_data = true;
543 info.dirty_bytes += inc_length;
544 }
545
546 if (!end_of_data || !inc_length) {
547 mutex_unlock(&migf->lock);
548 goto done;
549 }
550
551 mutex_unlock(&migf->lock);
552 /*
553 * We finished transferring the current state and the device has a
554 * dirty state, save a new state to be ready for.
555 */
556 buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
557 if (IS_ERR(buf)) {
558 ret = PTR_ERR(buf);
559 mlx5vf_mark_err(migf);
560 goto err_state_unlock;
561 }
562
563 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
564 if (ret) {
565 mlx5vf_mark_err(migf);
566 mlx5vf_put_data_buffer(buf);
567 goto err_state_unlock;
568 }
569
570 done:
571 mlx5vf_state_mutex_unlock(mvdev);
572 if (copy_to_user((void __user *)arg, &info, minsz))
573 return -EFAULT;
574 return 0;
575
576 err_migf_unlock:
577 mutex_unlock(&migf->lock);
578 err_state_unlock:
579 mlx5vf_state_mutex_unlock(mvdev);
580 return ret;
581 }
582
583 static const struct file_operations mlx5vf_save_fops = {
584 .owner = THIS_MODULE,
585 .read = mlx5vf_save_read,
586 .poll = mlx5vf_save_poll,
587 .unlocked_ioctl = mlx5vf_precopy_ioctl,
588 .compat_ioctl = compat_ptr_ioctl,
589 .release = mlx5vf_release_file,
590 };
591
mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device * mvdev)592 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
593 {
594 struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
595 struct mlx5_vhca_data_buffer *buf;
596 size_t length;
597 int ret;
598
599 if (migf->state == MLX5_MIGF_STATE_ERROR)
600 return -ENODEV;
601
602 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
603 MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
604 if (ret)
605 goto err;
606
607 buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
608 if (IS_ERR(buf)) {
609 ret = PTR_ERR(buf);
610 goto err;
611 }
612
613 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
614 if (ret)
615 goto err_save;
616
617 return 0;
618
619 err_save:
620 mlx5vf_put_data_buffer(buf);
621 err:
622 mlx5vf_mark_err(migf);
623 return ret;
624 }
625
626 static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device * mvdev,bool track)627 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
628 {
629 struct mlx5_vf_migration_file *migf;
630 struct mlx5_vhca_data_buffer *buf;
631 size_t length;
632 u64 full_size;
633 int ret;
634
635 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
636 if (!migf)
637 return ERR_PTR(-ENOMEM);
638
639 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
640 O_RDONLY);
641 if (IS_ERR(migf->filp)) {
642 ret = PTR_ERR(migf->filp);
643 goto end;
644 }
645
646 migf->mvdev = mvdev;
647 ret = mlx5vf_cmd_alloc_pd(migf);
648 if (ret)
649 goto out_free;
650
651 stream_open(migf->filp->f_inode, migf->filp);
652 mutex_init(&migf->lock);
653 init_waitqueue_head(&migf->poll_wait);
654 init_completion(&migf->save_comp);
655 /*
656 * save_comp is being used as a binary semaphore built from
657 * a completion. A normal mutex cannot be used because the lock is
658 * passed between kernel threads and lockdep can't model this.
659 */
660 complete(&migf->save_comp);
661 mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
662 INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
663 INIT_LIST_HEAD(&migf->buf_list);
664 INIT_LIST_HEAD(&migf->avail_list);
665 spin_lock_init(&migf->list_lock);
666 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
667 if (ret)
668 goto out_pd;
669
670 ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
671 if (ret)
672 goto out_pd;
673
674 if (track) {
675 /* leave the allocated buffer ready for the stop-copy phase */
676 buf = mlx5vf_alloc_data_buffer(migf,
677 migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
678 if (IS_ERR(buf)) {
679 ret = PTR_ERR(buf);
680 goto out_pd;
681 }
682 } else {
683 buf = migf->buf[0];
684 migf->buf[0] = NULL;
685 }
686
687 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
688 if (ret)
689 goto out_save;
690 return migf;
691 out_save:
692 mlx5vf_free_data_buffer(buf);
693 out_pd:
694 mlx5fv_cmd_clean_migf_resources(migf);
695 out_free:
696 fput(migf->filp);
697 end:
698 kfree(migf);
699 return ERR_PTR(ret);
700 }
701
702 static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer * vhca_buf,const char __user ** buf,size_t * len,loff_t * pos,ssize_t * done)703 mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
704 const char __user **buf, size_t *len,
705 loff_t *pos, ssize_t *done)
706 {
707 unsigned long offset;
708 size_t page_offset;
709 struct page *page;
710 size_t page_len;
711 u8 *to_buff;
712 int ret;
713
714 offset = *pos - vhca_buf->start_pos;
715 page_offset = offset % PAGE_SIZE;
716
717 page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
718 if (!page)
719 return -EINVAL;
720 page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
721 to_buff = kmap_local_page(page);
722 ret = copy_from_user(to_buff + page_offset, *buf, page_len);
723 kunmap_local(to_buff);
724 if (ret)
725 return -EFAULT;
726
727 *pos += page_len;
728 *done += page_len;
729 *buf += page_len;
730 *len -= page_len;
731 vhca_buf->length += page_len;
732 return 0;
733 }
734
735 static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file * migf,struct mlx5_vhca_data_buffer * vhca_buf,size_t image_size,const char __user ** buf,size_t * len,loff_t * pos,ssize_t * done,bool * has_work)736 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
737 struct mlx5_vhca_data_buffer *vhca_buf,
738 size_t image_size, const char __user **buf,
739 size_t *len, loff_t *pos, ssize_t *done,
740 bool *has_work)
741 {
742 size_t copy_len, to_copy;
743 int ret;
744
745 to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
746 copy_len = to_copy;
747 while (to_copy) {
748 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
749 done);
750 if (ret)
751 return ret;
752 }
753
754 *len -= copy_len;
755 if (vhca_buf->length == image_size) {
756 migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
757 migf->max_pos += image_size;
758 *has_work = true;
759 }
760
761 return 0;
762 }
763
764 static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file * migf,struct mlx5_vhca_data_buffer * vhca_buf,const char __user ** buf,size_t * len,loff_t * pos,ssize_t * done)765 mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
766 struct mlx5_vhca_data_buffer *vhca_buf,
767 const char __user **buf, size_t *len,
768 loff_t *pos, ssize_t *done)
769 {
770 size_t copy_len, to_copy;
771 size_t required_data;
772 u8 *to_buff;
773 int ret;
774
775 required_data = migf->record_size - vhca_buf->length;
776 to_copy = min_t(size_t, *len, required_data);
777 copy_len = to_copy;
778 while (to_copy) {
779 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
780 done);
781 if (ret)
782 return ret;
783 }
784
785 *len -= copy_len;
786 if (vhca_buf->length == migf->record_size) {
787 switch (migf->record_tag) {
788 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
789 {
790 struct page *page;
791
792 page = mlx5vf_get_migration_page(vhca_buf, 0);
793 if (!page)
794 return -EINVAL;
795 to_buff = kmap_local_page(page);
796 migf->stop_copy_prep_size = min_t(u64,
797 le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
798 kunmap_local(to_buff);
799 break;
800 }
801 default:
802 /* Optional tag */
803 break;
804 }
805
806 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
807 migf->max_pos += migf->record_size;
808 vhca_buf->length = 0;
809 }
810
811 return 0;
812 }
813
814 static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file * migf,struct mlx5_vhca_data_buffer * vhca_buf,const char __user ** buf,size_t * len,loff_t * pos,ssize_t * done,bool * has_work)815 mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
816 struct mlx5_vhca_data_buffer *vhca_buf,
817 const char __user **buf,
818 size_t *len, loff_t *pos,
819 ssize_t *done, bool *has_work)
820 {
821 struct page *page;
822 size_t copy_len;
823 u8 *to_buff;
824 int ret;
825
826 copy_len = min_t(size_t, *len,
827 sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
828 page = mlx5vf_get_migration_page(vhca_buf, 0);
829 if (!page)
830 return -EINVAL;
831 to_buff = kmap_local_page(page);
832 ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
833 if (ret) {
834 ret = -EFAULT;
835 goto end;
836 }
837
838 *buf += copy_len;
839 *pos += copy_len;
840 *done += copy_len;
841 *len -= copy_len;
842 vhca_buf->length += copy_len;
843 if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
844 u64 record_size;
845 u32 flags;
846
847 record_size = le64_to_cpup((__le64 *)to_buff);
848 if (record_size > MAX_LOAD_SIZE) {
849 ret = -ENOMEM;
850 goto end;
851 }
852
853 migf->record_size = record_size;
854 flags = le32_to_cpup((__le32 *)(to_buff +
855 offsetof(struct mlx5_vf_migration_header, flags)));
856 migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
857 offsetof(struct mlx5_vf_migration_header, tag)));
858 switch (migf->record_tag) {
859 case MLX5_MIGF_HEADER_TAG_FW_DATA:
860 migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
861 break;
862 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
863 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
864 break;
865 default:
866 if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
867 ret = -EOPNOTSUPP;
868 goto end;
869 }
870 /* We may read and skip this optional record data */
871 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
872 }
873
874 migf->max_pos += vhca_buf->length;
875 vhca_buf->length = 0;
876 *has_work = true;
877 }
878 end:
879 kunmap_local(to_buff);
880 return ret;
881 }
882
mlx5vf_resume_write(struct file * filp,const char __user * buf,size_t len,loff_t * pos)883 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
884 size_t len, loff_t *pos)
885 {
886 struct mlx5_vf_migration_file *migf = filp->private_data;
887 struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
888 struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
889 loff_t requested_length;
890 bool has_work = false;
891 ssize_t done = 0;
892 int ret = 0;
893
894 if (pos)
895 return -ESPIPE;
896 pos = &filp->f_pos;
897
898 if (*pos < 0 ||
899 check_add_overflow((loff_t)len, *pos, &requested_length))
900 return -EINVAL;
901
902 mutex_lock(&migf->mvdev->state_mutex);
903 mutex_lock(&migf->lock);
904 if (migf->state == MLX5_MIGF_STATE_ERROR) {
905 ret = -ENODEV;
906 goto out_unlock;
907 }
908
909 while (len || has_work) {
910 has_work = false;
911 switch (migf->load_state) {
912 case MLX5_VF_LOAD_STATE_READ_HEADER:
913 ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
914 &buf, &len, pos,
915 &done, &has_work);
916 if (ret)
917 goto out_unlock;
918 break;
919 case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
920 if (vhca_buf_header->allocated_length < migf->record_size) {
921 mlx5vf_free_data_buffer(vhca_buf_header);
922
923 migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
924 migf->record_size, DMA_NONE);
925 if (IS_ERR(migf->buf_header[0])) {
926 ret = PTR_ERR(migf->buf_header[0]);
927 migf->buf_header[0] = NULL;
928 goto out_unlock;
929 }
930
931 vhca_buf_header = migf->buf_header[0];
932 }
933
934 vhca_buf_header->start_pos = migf->max_pos;
935 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
936 break;
937 case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
938 ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
939 &buf, &len, pos, &done);
940 if (ret)
941 goto out_unlock;
942 break;
943 case MLX5_VF_LOAD_STATE_PREP_IMAGE:
944 {
945 u64 size = max(migf->record_size,
946 migf->stop_copy_prep_size);
947
948 if (vhca_buf->allocated_length < size) {
949 mlx5vf_free_data_buffer(vhca_buf);
950
951 migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
952 size, DMA_TO_DEVICE);
953 if (IS_ERR(migf->buf[0])) {
954 ret = PTR_ERR(migf->buf[0]);
955 migf->buf[0] = NULL;
956 goto out_unlock;
957 }
958
959 vhca_buf = migf->buf[0];
960 }
961
962 vhca_buf->start_pos = migf->max_pos;
963 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
964 break;
965 }
966 case MLX5_VF_LOAD_STATE_READ_IMAGE:
967 ret = mlx5vf_resume_read_image(migf, vhca_buf,
968 migf->record_size,
969 &buf, &len, pos, &done, &has_work);
970 if (ret)
971 goto out_unlock;
972 break;
973 case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
974 ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
975 if (ret)
976 goto out_unlock;
977 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
978
979 /* prep header buf for next image */
980 vhca_buf_header->length = 0;
981 /* prep data buf for next image */
982 vhca_buf->length = 0;
983
984 break;
985 default:
986 break;
987 }
988 }
989
990 out_unlock:
991 if (ret)
992 migf->state = MLX5_MIGF_STATE_ERROR;
993 mutex_unlock(&migf->lock);
994 mlx5vf_state_mutex_unlock(migf->mvdev);
995 return ret ? ret : done;
996 }
997
998 static const struct file_operations mlx5vf_resume_fops = {
999 .owner = THIS_MODULE,
1000 .write = mlx5vf_resume_write,
1001 .release = mlx5vf_release_file,
1002 };
1003
1004 static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device * mvdev)1005 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
1006 {
1007 struct mlx5_vf_migration_file *migf;
1008 struct mlx5_vhca_data_buffer *buf;
1009 int ret;
1010
1011 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
1012 if (!migf)
1013 return ERR_PTR(-ENOMEM);
1014
1015 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
1016 O_WRONLY);
1017 if (IS_ERR(migf->filp)) {
1018 ret = PTR_ERR(migf->filp);
1019 goto end;
1020 }
1021
1022 migf->mvdev = mvdev;
1023 ret = mlx5vf_cmd_alloc_pd(migf);
1024 if (ret)
1025 goto out_free;
1026
1027 buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
1028 if (IS_ERR(buf)) {
1029 ret = PTR_ERR(buf);
1030 goto out_pd;
1031 }
1032
1033 migf->buf[0] = buf;
1034 buf = mlx5vf_alloc_data_buffer(migf,
1035 sizeof(struct mlx5_vf_migration_header), DMA_NONE);
1036 if (IS_ERR(buf)) {
1037 ret = PTR_ERR(buf);
1038 goto out_buf;
1039 }
1040
1041 migf->buf_header[0] = buf;
1042 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
1043
1044 stream_open(migf->filp->f_inode, migf->filp);
1045 mutex_init(&migf->lock);
1046 INIT_LIST_HEAD(&migf->buf_list);
1047 INIT_LIST_HEAD(&migf->avail_list);
1048 spin_lock_init(&migf->list_lock);
1049 return migf;
1050 out_buf:
1051 mlx5vf_free_data_buffer(migf->buf[0]);
1052 out_pd:
1053 mlx5vf_cmd_dealloc_pd(migf);
1054 out_free:
1055 fput(migf->filp);
1056 end:
1057 kfree(migf);
1058 return ERR_PTR(ret);
1059 }
1060
mlx5vf_disable_fds(struct mlx5vf_pci_core_device * mvdev,enum mlx5_vf_migf_state * last_save_state)1061 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
1062 enum mlx5_vf_migf_state *last_save_state)
1063 {
1064 if (mvdev->resuming_migf) {
1065 mlx5vf_disable_fd(mvdev->resuming_migf);
1066 mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
1067 fput(mvdev->resuming_migf->filp);
1068 mvdev->resuming_migf = NULL;
1069 }
1070 if (mvdev->saving_migf) {
1071 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
1072 cancel_work_sync(&mvdev->saving_migf->async_data.work);
1073 if (last_save_state)
1074 *last_save_state = mvdev->saving_migf->state;
1075 mlx5vf_disable_fd(mvdev->saving_migf);
1076 wake_up_interruptible(&mvdev->saving_migf->poll_wait);
1077 mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
1078 fput(mvdev->saving_migf->filp);
1079 mvdev->saving_migf = NULL;
1080 }
1081 }
1082
1083 static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device * mvdev,u32 new)1084 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
1085 u32 new)
1086 {
1087 u32 cur = mvdev->mig_state;
1088 int ret;
1089
1090 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
1091 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1092 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1093 if (ret)
1094 return ERR_PTR(ret);
1095 return NULL;
1096 }
1097
1098 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
1099 ret = mlx5vf_cmd_resume_vhca(mvdev,
1100 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
1101 if (ret)
1102 return ERR_PTR(ret);
1103 return NULL;
1104 }
1105
1106 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
1107 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1108 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1109 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
1110 if (ret)
1111 return ERR_PTR(ret);
1112 return NULL;
1113 }
1114
1115 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
1116 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
1117 ret = mlx5vf_cmd_resume_vhca(mvdev,
1118 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
1119 if (ret)
1120 return ERR_PTR(ret);
1121 return NULL;
1122 }
1123
1124 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
1125 struct mlx5_vf_migration_file *migf;
1126
1127 migf = mlx5vf_pci_save_device_data(mvdev, false);
1128 if (IS_ERR(migf))
1129 return ERR_CAST(migf);
1130 get_file(migf->filp);
1131 mvdev->saving_migf = migf;
1132 return migf->filp;
1133 }
1134
1135 if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
1136 mlx5vf_disable_fds(mvdev, NULL);
1137 return NULL;
1138 }
1139
1140 if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
1141 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
1142 new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
1143 struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
1144 struct mlx5_vhca_data_buffer *buf;
1145 enum mlx5_vf_migf_state state;
1146 size_t size;
1147
1148 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
1149 MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
1150 if (ret)
1151 return ERR_PTR(ret);
1152 buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
1153 if (IS_ERR(buf))
1154 return ERR_CAST(buf);
1155 /* pre_copy cleanup */
1156 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
1157 if (ret) {
1158 mlx5vf_put_data_buffer(buf);
1159 return ERR_PTR(ret);
1160 }
1161 mlx5vf_disable_fds(mvdev, &state);
1162 return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
1163 }
1164
1165 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
1166 struct mlx5_vf_migration_file *migf;
1167
1168 migf = mlx5vf_pci_resume_device_data(mvdev);
1169 if (IS_ERR(migf))
1170 return ERR_CAST(migf);
1171 get_file(migf->filp);
1172 mvdev->resuming_migf = migf;
1173 return migf->filp;
1174 }
1175
1176 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
1177 mlx5vf_disable_fds(mvdev, NULL);
1178 return NULL;
1179 }
1180
1181 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
1182 (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
1183 new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1184 struct mlx5_vf_migration_file *migf;
1185
1186 migf = mlx5vf_pci_save_device_data(mvdev, true);
1187 if (IS_ERR(migf))
1188 return ERR_CAST(migf);
1189 get_file(migf->filp);
1190 mvdev->saving_migf = migf;
1191 return migf->filp;
1192 }
1193
1194 if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
1195 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1196 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1197 if (ret)
1198 return ERR_PTR(ret);
1199 ret = mlx5vf_pci_save_device_inc_data(mvdev);
1200 return ret ? ERR_PTR(ret) : NULL;
1201 }
1202
1203 /*
1204 * vfio_mig_get_next_state() does not use arcs other than the above
1205 */
1206 WARN_ON(true);
1207 return ERR_PTR(-EINVAL);
1208 }
1209
1210 /*
1211 * This function is called in all state_mutex unlock cases to
1212 * handle a 'deferred_reset' if exists.
1213 */
mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device * mvdev)1214 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
1215 {
1216 again:
1217 spin_lock(&mvdev->reset_lock);
1218 if (mvdev->deferred_reset) {
1219 mvdev->deferred_reset = false;
1220 spin_unlock(&mvdev->reset_lock);
1221 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1222 mlx5vf_disable_fds(mvdev, NULL);
1223 goto again;
1224 }
1225 mutex_unlock(&mvdev->state_mutex);
1226 spin_unlock(&mvdev->reset_lock);
1227 }
1228
1229 static struct file *
mlx5vf_pci_set_device_state(struct vfio_device * vdev,enum vfio_device_mig_state new_state)1230 mlx5vf_pci_set_device_state(struct vfio_device *vdev,
1231 enum vfio_device_mig_state new_state)
1232 {
1233 struct mlx5vf_pci_core_device *mvdev = container_of(
1234 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1235 enum vfio_device_mig_state next_state;
1236 struct file *res = NULL;
1237 int ret;
1238
1239 mutex_lock(&mvdev->state_mutex);
1240 while (new_state != mvdev->mig_state) {
1241 ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
1242 new_state, &next_state);
1243 if (ret) {
1244 res = ERR_PTR(ret);
1245 break;
1246 }
1247 res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
1248 if (IS_ERR(res))
1249 break;
1250 mvdev->mig_state = next_state;
1251 if (WARN_ON(res && new_state != mvdev->mig_state)) {
1252 fput(res);
1253 res = ERR_PTR(-EINVAL);
1254 break;
1255 }
1256 }
1257 mlx5vf_state_mutex_unlock(mvdev);
1258 return res;
1259 }
1260
mlx5vf_pci_get_data_size(struct vfio_device * vdev,unsigned long * stop_copy_length)1261 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
1262 unsigned long *stop_copy_length)
1263 {
1264 struct mlx5vf_pci_core_device *mvdev = container_of(
1265 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1266 size_t state_size;
1267 u64 total_size;
1268 int ret;
1269
1270 mutex_lock(&mvdev->state_mutex);
1271 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
1272 &total_size, 0);
1273 if (!ret)
1274 *stop_copy_length = total_size;
1275 mlx5vf_state_mutex_unlock(mvdev);
1276 return ret;
1277 }
1278
mlx5vf_pci_get_device_state(struct vfio_device * vdev,enum vfio_device_mig_state * curr_state)1279 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
1280 enum vfio_device_mig_state *curr_state)
1281 {
1282 struct mlx5vf_pci_core_device *mvdev = container_of(
1283 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1284
1285 mutex_lock(&mvdev->state_mutex);
1286 *curr_state = mvdev->mig_state;
1287 mlx5vf_state_mutex_unlock(mvdev);
1288 return 0;
1289 }
1290
mlx5vf_pci_aer_reset_done(struct pci_dev * pdev)1291 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1292 {
1293 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1294
1295 if (!mvdev->migrate_cap)
1296 return;
1297
1298 /*
1299 * As the higher VFIO layers are holding locks across reset and using
1300 * those same locks with the mm_lock we need to prevent ABBA deadlock
1301 * with the state_mutex and mm_lock.
1302 * In case the state_mutex was taken already we defer the cleanup work
1303 * to the unlock flow of the other running context.
1304 */
1305 spin_lock(&mvdev->reset_lock);
1306 mvdev->deferred_reset = true;
1307 if (!mutex_trylock(&mvdev->state_mutex)) {
1308 spin_unlock(&mvdev->reset_lock);
1309 return;
1310 }
1311 spin_unlock(&mvdev->reset_lock);
1312 mlx5vf_state_mutex_unlock(mvdev);
1313 }
1314
mlx5vf_pci_open_device(struct vfio_device * core_vdev)1315 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1316 {
1317 struct mlx5vf_pci_core_device *mvdev = container_of(
1318 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1319 struct vfio_pci_core_device *vdev = &mvdev->core_device;
1320 int ret;
1321
1322 ret = vfio_pci_core_enable(vdev);
1323 if (ret)
1324 return ret;
1325
1326 if (mvdev->migrate_cap)
1327 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1328 vfio_pci_core_finish_enable(vdev);
1329 return 0;
1330 }
1331
mlx5vf_pci_close_device(struct vfio_device * core_vdev)1332 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1333 {
1334 struct mlx5vf_pci_core_device *mvdev = container_of(
1335 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1336
1337 mlx5vf_cmd_close_migratable(mvdev);
1338 vfio_pci_core_close_device(core_vdev);
1339 }
1340
1341 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
1342 .migration_set_state = mlx5vf_pci_set_device_state,
1343 .migration_get_state = mlx5vf_pci_get_device_state,
1344 .migration_get_data_size = mlx5vf_pci_get_data_size,
1345 };
1346
1347 static const struct vfio_log_ops mlx5vf_pci_log_ops = {
1348 .log_start = mlx5vf_start_page_tracker,
1349 .log_stop = mlx5vf_stop_page_tracker,
1350 .log_read_and_clear = mlx5vf_tracker_read_and_clear,
1351 };
1352
mlx5vf_pci_init_dev(struct vfio_device * core_vdev)1353 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1354 {
1355 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1356 struct mlx5vf_pci_core_device, core_device.vdev);
1357 int ret;
1358
1359 ret = vfio_pci_core_init_dev(core_vdev);
1360 if (ret)
1361 return ret;
1362
1363 mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
1364 &mlx5vf_pci_log_ops);
1365
1366 return 0;
1367 }
1368
mlx5vf_pci_release_dev(struct vfio_device * core_vdev)1369 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1370 {
1371 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1372 struct mlx5vf_pci_core_device, core_device.vdev);
1373
1374 mlx5vf_cmd_remove_migratable(mvdev);
1375 vfio_pci_core_release_dev(core_vdev);
1376 }
1377
1378 static const struct vfio_device_ops mlx5vf_pci_ops = {
1379 .name = "mlx5-vfio-pci",
1380 .init = mlx5vf_pci_init_dev,
1381 .release = mlx5vf_pci_release_dev,
1382 .open_device = mlx5vf_pci_open_device,
1383 .close_device = mlx5vf_pci_close_device,
1384 .ioctl = vfio_pci_core_ioctl,
1385 .device_feature = vfio_pci_core_ioctl_feature,
1386 .read = vfio_pci_core_read,
1387 .write = vfio_pci_core_write,
1388 .mmap = vfio_pci_core_mmap,
1389 .request = vfio_pci_core_request,
1390 .match = vfio_pci_core_match,
1391 .bind_iommufd = vfio_iommufd_physical_bind,
1392 .unbind_iommufd = vfio_iommufd_physical_unbind,
1393 .attach_ioas = vfio_iommufd_physical_attach_ioas,
1394 .detach_ioas = vfio_iommufd_physical_detach_ioas,
1395 };
1396
mlx5vf_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)1397 static int mlx5vf_pci_probe(struct pci_dev *pdev,
1398 const struct pci_device_id *id)
1399 {
1400 struct mlx5vf_pci_core_device *mvdev;
1401 int ret;
1402
1403 mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1404 &pdev->dev, &mlx5vf_pci_ops);
1405 if (IS_ERR(mvdev))
1406 return PTR_ERR(mvdev);
1407
1408 dev_set_drvdata(&pdev->dev, &mvdev->core_device);
1409 ret = vfio_pci_core_register_device(&mvdev->core_device);
1410 if (ret)
1411 goto out_put_vdev;
1412 return 0;
1413
1414 out_put_vdev:
1415 vfio_put_device(&mvdev->core_device.vdev);
1416 return ret;
1417 }
1418
mlx5vf_pci_remove(struct pci_dev * pdev)1419 static void mlx5vf_pci_remove(struct pci_dev *pdev)
1420 {
1421 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1422
1423 vfio_pci_core_unregister_device(&mvdev->core_device);
1424 vfio_put_device(&mvdev->core_device.vdev);
1425 }
1426
1427 static const struct pci_device_id mlx5vf_pci_table[] = {
1428 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
1429 {}
1430 };
1431
1432 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1433
1434 static const struct pci_error_handlers mlx5vf_err_handlers = {
1435 .reset_done = mlx5vf_pci_aer_reset_done,
1436 .error_detected = vfio_pci_core_aer_err_detected,
1437 };
1438
1439 static struct pci_driver mlx5vf_pci_driver = {
1440 .name = KBUILD_MODNAME,
1441 .id_table = mlx5vf_pci_table,
1442 .probe = mlx5vf_pci_probe,
1443 .remove = mlx5vf_pci_remove,
1444 .err_handler = &mlx5vf_err_handlers,
1445 .driver_managed_dma = true,
1446 };
1447
/* Generates the module init/exit code that registers mlx5vf_pci_driver. */
module_pci_driver(mlx5vf_pci_driver);

/*
 * NOTE(review): presumably required by the vfio_iommufd_physical_* helpers
 * referenced from mlx5vf_pci_ops - confirm against their export namespace.
 */
MODULE_IMPORT_NS(IOMMUFD);
/* Module metadata */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1456