1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6#include <linux/device.h>
7#include <linux/eventfd.h>
8#include <linux/file.h>
9#include <linux/interrupt.h>
10#include <linux/iommu.h>
11#include <linux/module.h>
12#include <linux/mutex.h>
13#include <linux/notifier.h>
14#include <linux/pci.h>
15#include <linux/pm_runtime.h>
16#include <linux/types.h>
17#include <linux/uaccess.h>
18#include <linux/vfio.h>
19#include <linux/sched/mm.h>
20#include <linux/anon_inodes.h>
21
22#include "cmd.h"
23
/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

/* Cap applied to the per-chunk buffer size when the device is in chunk mode */
#define MAX_CHUNK_SIZE SZ_8M
28
29static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
30{
31	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
32
33	return container_of(core_device, struct mlx5vf_pci_core_device,
34			    core_device);
35}
36
37struct page *
38mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
39			  unsigned long offset)
40{
41	unsigned long cur_offset = 0;
42	struct scatterlist *sg;
43	unsigned int i;
44
45	/* All accesses are sequential */
46	if (offset < buf->last_offset || !buf->last_offset_sg) {
47		buf->last_offset = 0;
48		buf->last_offset_sg = buf->table.sgt.sgl;
49		buf->sg_last_entry = 0;
50	}
51
52	cur_offset = buf->last_offset;
53
54	for_each_sg(buf->last_offset_sg, sg,
55			buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
56		if (offset < sg->length + cur_offset) {
57			buf->last_offset_sg = sg;
58			buf->sg_last_entry += i;
59			buf->last_offset = cur_offset;
60			return nth_page(sg_page(sg),
61					(offset - cur_offset) / PAGE_SIZE);
62		}
63		cur_offset += sg->length;
64	}
65	return NULL;
66}
67
68static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
69{
70	mutex_lock(&migf->lock);
71	migf->state = MLX5_MIGF_STATE_ERROR;
72	migf->filp->f_pos = 0;
73	mutex_unlock(&migf->lock);
74}
75
76static int mlx5vf_release_file(struct inode *inode, struct file *filp)
77{
78	struct mlx5_vf_migration_file *migf = filp->private_data;
79
80	mlx5vf_disable_fd(migf);
81	mutex_destroy(&migf->lock);
82	kfree(migf);
83	return 0;
84}
85
86static struct mlx5_vhca_data_buffer *
87mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
88			      bool *end_of_data)
89{
90	struct mlx5_vhca_data_buffer *buf;
91	bool found = false;
92
93	*end_of_data = false;
94	spin_lock_irq(&migf->list_lock);
95	if (list_empty(&migf->buf_list)) {
96		*end_of_data = true;
97		goto end;
98	}
99
100	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
101			       buf_elm);
102	if (pos >= buf->start_pos &&
103	    pos < buf->start_pos + buf->length) {
104		found = true;
105		goto end;
106	}
107
108	/*
109	 * As we use a stream based FD we may expect having the data always
110	 * on first chunk
111	 */
112	migf->state = MLX5_MIGF_STATE_ERROR;
113
114end:
115	spin_unlock_irq(&migf->list_lock);
116	return found ? buf : NULL;
117}
118
119static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
120{
121	struct mlx5_vf_migration_file *migf = vhca_buf->migf;
122
123	if (vhca_buf->stop_copy_chunk_num) {
124		bool is_header = vhca_buf->dma_dir == DMA_NONE;
125		u8 chunk_num = vhca_buf->stop_copy_chunk_num;
126		size_t next_required_umem_size = 0;
127
128		if (is_header)
129			migf->buf_header[chunk_num - 1] = vhca_buf;
130		else
131			migf->buf[chunk_num - 1] = vhca_buf;
132
133		spin_lock_irq(&migf->list_lock);
134		list_del_init(&vhca_buf->buf_elm);
135		if (!is_header) {
136			next_required_umem_size =
137				migf->next_required_umem_size;
138			migf->next_required_umem_size = 0;
139			migf->num_ready_chunks--;
140		}
141		spin_unlock_irq(&migf->list_lock);
142		if (next_required_umem_size)
143			mlx5vf_mig_file_set_save_work(migf, chunk_num,
144						      next_required_umem_size);
145		return;
146	}
147
148	spin_lock_irq(&migf->list_lock);
149	list_del_init(&vhca_buf->buf_elm);
150	list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
151	spin_unlock_irq(&migf->list_lock);
152}
153
154static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
155			       char __user **buf, size_t *len, loff_t *pos)
156{
157	unsigned long offset;
158	ssize_t done = 0;
159	size_t copy_len;
160
161	copy_len = min_t(size_t,
162			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
163	while (copy_len) {
164		size_t page_offset;
165		struct page *page;
166		size_t page_len;
167		u8 *from_buff;
168		int ret;
169
170		offset = *pos - vhca_buf->start_pos;
171		page_offset = offset % PAGE_SIZE;
172		offset -= page_offset;
173		page = mlx5vf_get_migration_page(vhca_buf, offset);
174		if (!page)
175			return -EINVAL;
176		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
177		from_buff = kmap_local_page(page);
178		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
179		kunmap_local(from_buff);
180		if (ret)
181			return -EFAULT;
182		*pos += page_len;
183		*len -= page_len;
184		*buf += page_len;
185		done += page_len;
186		copy_len -= page_len;
187	}
188
189	if (*pos >= vhca_buf->start_pos + vhca_buf->length)
190		mlx5vf_buf_read_done(vhca_buf);
191
192	return done;
193}
194
/*
 * read() handler of the save migration file.
 *
 * The file is a stream: an explicit position is rejected with -ESPIPE and
 * filp->f_pos is used instead. Unless O_NONBLOCK is set, the call first
 * sleeps until a buffer is queued or the file is in one of the states listed
 * in the wait condition. Data is then copied, under migf->lock, from the
 * buffer at the head of migf->buf_list until @len bytes were copied or no
 * queued data is left.
 *
 * Return: number of bytes copied, or a negative errno: -ERESTARTSYS when the
 * wait is interrupted, -ENODEV when the file is in the error state, -ENOMSG
 * when nothing is queued while in a PRE_COPY state, -EAGAIN for a
 * non-blocking read with nothing queued before completion, -EINVAL when the
 * position is not covered by the first queued buffer, or the error returned
 * by mlx5vf_buf_read().
 */
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
			       loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	/* Stream file: positional reads are not supported */
	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		/* The "no data" errors apply only before anything was copied */
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		/* Nothing (more) queued: return what was copied so far */
		if (end_of_data)
			goto out_unlock;

		/* Data is queued but the first buffer does not cover *pos */
		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}
266
267static __poll_t mlx5vf_save_poll(struct file *filp,
268				 struct poll_table_struct *wait)
269{
270	struct mlx5_vf_migration_file *migf = filp->private_data;
271	__poll_t pollflags = 0;
272
273	poll_wait(filp, &migf->poll_wait, wait);
274
275	mutex_lock(&migf->lock);
276	if (migf->state == MLX5_MIGF_STATE_ERROR)
277		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
278	else if (!list_empty(&migf->buf_list) ||
279		 migf->state == MLX5_MIGF_STATE_COMPLETE)
280		pollflags = EPOLLIN | EPOLLRDNORM;
281	mutex_unlock(&migf->lock);
282
283	return pollflags;
284}
285
/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 *
 * The state is written before the wake-up on migf->poll_wait is issued.
 * This helper takes no lock itself.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}
295
296void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
297				   u8 chunk_num, size_t next_required_umem_size)
298{
299	migf->save_data[chunk_num - 1].next_required_umem_size =
300			next_required_umem_size;
301	migf->save_data[chunk_num - 1].migf = migf;
302	get_file(migf->filp);
303	queue_work(migf->mvdev->cb_wq,
304		   &migf->save_data[chunk_num - 1].work);
305}
306
307static struct mlx5_vhca_data_buffer *
308mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
309				  u8 index, size_t required_length)
310{
311	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
312	u8 chunk_num;
313
314	WARN_ON(!buf);
315	chunk_num = buf->stop_copy_chunk_num;
316	buf->migf->buf[index] = NULL;
317	/* Checking whether the pre-allocated buffer can fit */
318	if (buf->allocated_length >= required_length)
319		return buf;
320
321	mlx5vf_put_data_buffer(buf);
322	buf = mlx5vf_get_data_buffer(buf->migf, required_length,
323				     DMA_FROM_DEVICE);
324	if (IS_ERR(buf))
325		return buf;
326
327	buf->stop_copy_chunk_num = chunk_num;
328	return buf;
329}
330
331static void mlx5vf_mig_file_save_work(struct work_struct *_work)
332{
333	struct mlx5vf_save_work_data *save_data = container_of(_work,
334		struct mlx5vf_save_work_data, work);
335	struct mlx5_vf_migration_file *migf = save_data->migf;
336	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
337	struct mlx5_vhca_data_buffer *buf;
338
339	mutex_lock(&mvdev->state_mutex);
340	if (migf->state == MLX5_MIGF_STATE_ERROR)
341		goto end;
342
343	buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
344				save_data->chunk_num - 1,
345				save_data->next_required_umem_size);
346	if (IS_ERR(buf))
347		goto err;
348
349	if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
350		goto err_save;
351
352	goto end;
353
354err_save:
355	mlx5vf_put_data_buffer(buf);
356err:
357	mlx5vf_mark_err(migf);
358end:
359	mlx5vf_state_mutex_unlock(mvdev);
360	fput(migf->filp);
361}
362
363static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
364				       bool track)
365{
366	size_t size = sizeof(struct mlx5_vf_migration_header) +
367		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
368	struct mlx5_vf_migration_tag_stop_copy_data data = {};
369	struct mlx5_vhca_data_buffer *header_buf = NULL;
370	struct mlx5_vf_migration_header header = {};
371	unsigned long flags;
372	struct page *page;
373	u8 *to_buff;
374	int ret;
375
376	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
377	if (IS_ERR(header_buf))
378		return PTR_ERR(header_buf);
379
380	header.record_size = cpu_to_le64(sizeof(data));
381	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
382	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
383	page = mlx5vf_get_migration_page(header_buf, 0);
384	if (!page) {
385		ret = -EINVAL;
386		goto err;
387	}
388	to_buff = kmap_local_page(page);
389	memcpy(to_buff, &header, sizeof(header));
390	header_buf->length = sizeof(header);
391	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
392	memcpy(to_buff + sizeof(header), &data, sizeof(data));
393	header_buf->length += sizeof(data);
394	kunmap_local(to_buff);
395	header_buf->start_pos = header_buf->migf->max_pos;
396	migf->max_pos += header_buf->length;
397	spin_lock_irqsave(&migf->list_lock, flags);
398	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
399	spin_unlock_irqrestore(&migf->list_lock, flags);
400	if (track)
401		migf->pre_copy_initial_bytes = size;
402	return 0;
403err:
404	mlx5vf_put_data_buffer(header_buf);
405	return ret;
406}
407
408static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
409				 struct mlx5_vf_migration_file *migf,
410				 size_t state_size, u64 full_size,
411				 bool track)
412{
413	struct mlx5_vhca_data_buffer *buf;
414	size_t inc_state_size;
415	int num_chunks;
416	int ret;
417	int i;
418
419	if (mvdev->chunk_mode) {
420		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
421
422		/* from firmware perspective at least 'state_size' buffer should be set */
423		inc_state_size = max(state_size, chunk_size);
424	} else {
425		if (track) {
426			/* let's be ready for stop_copy size that might grow by 10 percents */
427			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
428				inc_state_size = state_size;
429		} else {
430			inc_state_size = state_size;
431		}
432	}
433
434	/* let's not overflow the device specification max SAVE size */
435	inc_state_size = min_t(size_t, inc_state_size,
436		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
437
438	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
439	for (i = 0; i < num_chunks; i++) {
440		buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
441		if (IS_ERR(buf)) {
442			ret = PTR_ERR(buf);
443			goto err;
444		}
445
446		migf->buf[i] = buf;
447		buf = mlx5vf_get_data_buffer(migf,
448				sizeof(struct mlx5_vf_migration_header), DMA_NONE);
449		if (IS_ERR(buf)) {
450			ret = PTR_ERR(buf);
451			goto err;
452		}
453		migf->buf_header[i] = buf;
454		if (mvdev->chunk_mode) {
455			migf->buf[i]->stop_copy_chunk_num = i + 1;
456			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
457			INIT_WORK(&migf->save_data[i].work,
458				  mlx5vf_mig_file_save_work);
459			migf->save_data[i].chunk_num = i + 1;
460		}
461	}
462
463	ret = mlx5vf_add_stop_copy_header(migf, track);
464	if (ret)
465		goto err;
466	return 0;
467
468err:
469	for (i = 0; i < num_chunks; i++) {
470		if (migf->buf[i]) {
471			mlx5vf_put_data_buffer(migf->buf[i]);
472			migf->buf[i] = NULL;
473		}
474		if (migf->buf_header[i]) {
475			mlx5vf_put_data_buffer(migf->buf_header[i]);
476			migf->buf_header[i] = NULL;
477		}
478	}
479
480	return ret;
481}
482
/*
 * ioctl() handler of the save migration file; only
 * VFIO_MIG_GET_PRECOPY_INFO is supported.
 *
 * Reports through initial_bytes/dirty_bytes how much data is left to read
 * relative to the current file position. In VFIO_DEVICE_STATE_PRE_COPY the
 * device is also queried for its incremental state size; when everything
 * produced so far was read and the device reports more, a new SAVE is issued
 * into a freshly obtained buffer.
 *
 * Return: 0 on success, -ENOTTY for any other command, -EFAULT when a user
 * copy fails, -EINVAL for a too small argsz or when the device is not in a
 * PRE_COPY state, -ENODEV when the file is in the error state, or the error
 * of the query, buffer allocation or SAVE step.
 */
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	/* Only the fields up to and including dirty_bytes are exchanged */
	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
	 * bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As so, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	/*
	 * Positions below pre_copy_initial_bytes are reported as initial
	 * bytes; beyond that, report what is queued plus what the device
	 * still holds as dirty bytes.
	 */
	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	/* A new SAVE is needed only if all was read and the device has more */
	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state, save a new state to be ready for.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}
582
583static const struct file_operations mlx5vf_save_fops = {
584	.owner = THIS_MODULE,
585	.read = mlx5vf_save_read,
586	.poll = mlx5vf_save_poll,
587	.unlocked_ioctl = mlx5vf_precopy_ioctl,
588	.compat_ioctl = compat_ptr_ioctl,
589	.release = mlx5vf_release_file,
590	.llseek = no_llseek,
591};
592
593static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
594{
595	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
596	struct mlx5_vhca_data_buffer *buf;
597	size_t length;
598	int ret;
599
600	if (migf->state == MLX5_MIGF_STATE_ERROR)
601		return -ENODEV;
602
603	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
604				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
605	if (ret)
606		goto err;
607
608	buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
609	if (IS_ERR(buf)) {
610		ret = PTR_ERR(buf);
611		goto err;
612	}
613
614	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
615	if (ret)
616		goto err_save;
617
618	return 0;
619
620err_save:
621	mlx5vf_put_data_buffer(buf);
622err:
623	mlx5vf_mark_err(migf);
624	return ret;
625}
626
627static struct mlx5_vf_migration_file *
628mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
629{
630	struct mlx5_vf_migration_file *migf;
631	struct mlx5_vhca_data_buffer *buf;
632	size_t length;
633	u64 full_size;
634	int ret;
635
636	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
637	if (!migf)
638		return ERR_PTR(-ENOMEM);
639
640	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
641					O_RDONLY);
642	if (IS_ERR(migf->filp)) {
643		ret = PTR_ERR(migf->filp);
644		goto end;
645	}
646
647	migf->mvdev = mvdev;
648	ret = mlx5vf_cmd_alloc_pd(migf);
649	if (ret)
650		goto out_free;
651
652	stream_open(migf->filp->f_inode, migf->filp);
653	mutex_init(&migf->lock);
654	init_waitqueue_head(&migf->poll_wait);
655	init_completion(&migf->save_comp);
656	/*
657	 * save_comp is being used as a binary semaphore built from
658	 * a completion. A normal mutex cannot be used because the lock is
659	 * passed between kernel threads and lockdep can't model this.
660	 */
661	complete(&migf->save_comp);
662	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
663	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
664	INIT_LIST_HEAD(&migf->buf_list);
665	INIT_LIST_HEAD(&migf->avail_list);
666	spin_lock_init(&migf->list_lock);
667	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
668	if (ret)
669		goto out_pd;
670
671	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
672	if (ret)
673		goto out_pd;
674
675	if (track) {
676		/* leave the allocated buffer ready for the stop-copy phase */
677		buf = mlx5vf_alloc_data_buffer(migf,
678			migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
679		if (IS_ERR(buf)) {
680			ret = PTR_ERR(buf);
681			goto out_pd;
682		}
683	} else {
684		buf = migf->buf[0];
685		migf->buf[0] = NULL;
686	}
687
688	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
689	if (ret)
690		goto out_save;
691	return migf;
692out_save:
693	mlx5vf_free_data_buffer(buf);
694out_pd:
695	mlx5fv_cmd_clean_migf_resources(migf);
696out_free:
697	fput(migf->filp);
698end:
699	kfree(migf);
700	return ERR_PTR(ret);
701}
702
703static int
704mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
705			      const char __user **buf, size_t *len,
706			      loff_t *pos, ssize_t *done)
707{
708	unsigned long offset;
709	size_t page_offset;
710	struct page *page;
711	size_t page_len;
712	u8 *to_buff;
713	int ret;
714
715	offset = *pos - vhca_buf->start_pos;
716	page_offset = offset % PAGE_SIZE;
717
718	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
719	if (!page)
720		return -EINVAL;
721	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
722	to_buff = kmap_local_page(page);
723	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
724	kunmap_local(to_buff);
725	if (ret)
726		return -EFAULT;
727
728	*pos += page_len;
729	*done += page_len;
730	*buf += page_len;
731	*len -= page_len;
732	vhca_buf->length += page_len;
733	return 0;
734}
735
736static ssize_t
737mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
738			 struct mlx5_vhca_data_buffer *vhca_buf,
739			 size_t image_size, const char __user **buf,
740			 size_t *len, loff_t *pos, ssize_t *done,
741			 bool *has_work)
742{
743	size_t copy_len, to_copy;
744	int ret;
745
746	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
747	copy_len = to_copy;
748	while (to_copy) {
749		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
750						    done);
751		if (ret)
752			return ret;
753	}
754
755	*len -= copy_len;
756	if (vhca_buf->length == image_size) {
757		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
758		migf->max_pos += image_size;
759		*has_work = true;
760	}
761
762	return 0;
763}
764
765static int
766mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
767			       struct mlx5_vhca_data_buffer *vhca_buf,
768			       const char __user **buf, size_t *len,
769			       loff_t *pos, ssize_t *done)
770{
771	size_t copy_len, to_copy;
772	size_t required_data;
773	u8 *to_buff;
774	int ret;
775
776	required_data = migf->record_size - vhca_buf->length;
777	to_copy = min_t(size_t, *len, required_data);
778	copy_len = to_copy;
779	while (to_copy) {
780		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
781						    done);
782		if (ret)
783			return ret;
784	}
785
786	*len -= copy_len;
787	if (vhca_buf->length == migf->record_size) {
788		switch (migf->record_tag) {
789		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
790		{
791			struct page *page;
792
793			page = mlx5vf_get_migration_page(vhca_buf, 0);
794			if (!page)
795				return -EINVAL;
796			to_buff = kmap_local_page(page);
797			migf->stop_copy_prep_size = min_t(u64,
798				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
799			kunmap_local(to_buff);
800			break;
801		}
802		default:
803			/* Optional tag */
804			break;
805		}
806
807		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
808		migf->max_pos += migf->record_size;
809		vhca_buf->length = 0;
810	}
811
812	return 0;
813}
814
/*
 * Accumulate a record header from user data and, once it is complete, decide
 * how the record that follows is handled.
 *
 * Copies up to the number of bytes still missing from the header into the
 * first page of @vhca_buf. When the header is complete its record size, flags
 * and tag are decoded: an FW_DATA tag moves the load state to PREP_IMAGE, a
 * STOP_COPY_SIZE tag or an unknown tag flagged optional moves it to
 * PREP_HEADER_DATA. max_pos is then advanced by the header length, the buffer
 * is emptied and *has_work is set.
 *
 * Return: 0 on success (including a still incomplete header), -EINVAL when
 * the buffer has no backing page, -EFAULT when the user copy fails, -ENOMEM
 * when the record size exceeds MAX_LOAD_SIZE, or -EOPNOTSUPP for an unknown
 * tag that is not flagged optional.
 */
static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	/* Take only what is still missing from the header */
	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		/* The record size is read from the very start of the header */
		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	/* ret is 0 here on the success paths (set by copy_from_user()) */
	kunmap_local(to_buff);
	return ret;
}
883
/*
 * write() handler of the resume migration file.
 *
 * The file is a stream: an explicit position is rejected with -ESPIPE and
 * filp->f_pos is used instead. The written data is fed through the load
 * state machine (migf->load_state) under the device state_mutex and
 * migf->lock: a record header is read, then either the record's header data
 * or a device image, and a complete image is loaded into the device. The
 * loop runs while user data is left or the previous step flagged follow-up
 * work that needs no further input.
 *
 * Return: number of bytes consumed, or a negative errno. Any error after the
 * locks are taken moves the file to MLX5_MIGF_STATE_ERROR.
 */
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	/* Stream file: positional writes are not supported */
	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	/* requested_length is only used to detect position overflow */
	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			/* Replace the header buffer if the record does not fit */
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header[0];
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			/* Size for the larger of this record and the announced stop-copy size */
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf[0];
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}
998
999static const struct file_operations mlx5vf_resume_fops = {
1000	.owner = THIS_MODULE,
1001	.write = mlx5vf_resume_write,
1002	.release = mlx5vf_release_file,
1003	.llseek = no_llseek,
1004};
1005
1006static struct mlx5_vf_migration_file *
1007mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
1008{
1009	struct mlx5_vf_migration_file *migf;
1010	struct mlx5_vhca_data_buffer *buf;
1011	int ret;
1012
1013	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
1014	if (!migf)
1015		return ERR_PTR(-ENOMEM);
1016
1017	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
1018					O_WRONLY);
1019	if (IS_ERR(migf->filp)) {
1020		ret = PTR_ERR(migf->filp);
1021		goto end;
1022	}
1023
1024	migf->mvdev = mvdev;
1025	ret = mlx5vf_cmd_alloc_pd(migf);
1026	if (ret)
1027		goto out_free;
1028
1029	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
1030	if (IS_ERR(buf)) {
1031		ret = PTR_ERR(buf);
1032		goto out_pd;
1033	}
1034
1035	migf->buf[0] = buf;
1036	buf = mlx5vf_alloc_data_buffer(migf,
1037		sizeof(struct mlx5_vf_migration_header), DMA_NONE);
1038	if (IS_ERR(buf)) {
1039		ret = PTR_ERR(buf);
1040		goto out_buf;
1041	}
1042
1043	migf->buf_header[0] = buf;
1044	migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
1045
1046	stream_open(migf->filp->f_inode, migf->filp);
1047	mutex_init(&migf->lock);
1048	INIT_LIST_HEAD(&migf->buf_list);
1049	INIT_LIST_HEAD(&migf->avail_list);
1050	spin_lock_init(&migf->list_lock);
1051	return migf;
1052out_buf:
1053	mlx5vf_free_data_buffer(migf->buf[0]);
1054out_pd:
1055	mlx5vf_cmd_dealloc_pd(migf);
1056out_free:
1057	fput(migf->filp);
1058end:
1059	kfree(migf);
1060	return ERR_PTR(ret);
1061}
1062
1063void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
1064			enum mlx5_vf_migf_state *last_save_state)
1065{
1066	if (mvdev->resuming_migf) {
1067		mlx5vf_disable_fd(mvdev->resuming_migf);
1068		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
1069		fput(mvdev->resuming_migf->filp);
1070		mvdev->resuming_migf = NULL;
1071	}
1072	if (mvdev->saving_migf) {
1073		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
1074		cancel_work_sync(&mvdev->saving_migf->async_data.work);
1075		if (last_save_state)
1076			*last_save_state = mvdev->saving_migf->state;
1077		mlx5vf_disable_fd(mvdev->saving_migf);
1078		wake_up_interruptible(&mvdev->saving_migf->poll_wait);
1079		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
1080		fput(mvdev->saving_migf->filp);
1081		mvdev->saving_migf = NULL;
1082	}
1083}
1084
1085static struct file *
1086mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
1087				    u32 new)
1088{
1089	u32 cur = mvdev->mig_state;
1090	int ret;
1091
1092	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
1093		ret = mlx5vf_cmd_suspend_vhca(mvdev,
1094			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1095		if (ret)
1096			return ERR_PTR(ret);
1097		return NULL;
1098	}
1099
1100	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
1101		ret = mlx5vf_cmd_resume_vhca(mvdev,
1102			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
1103		if (ret)
1104			return ERR_PTR(ret);
1105		return NULL;
1106	}
1107
1108	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
1109	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1110		ret = mlx5vf_cmd_suspend_vhca(mvdev,
1111			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
1112		if (ret)
1113			return ERR_PTR(ret);
1114		return NULL;
1115	}
1116
1117	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
1118	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
1119		ret = mlx5vf_cmd_resume_vhca(mvdev,
1120			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
1121		if (ret)
1122			return ERR_PTR(ret);
1123		return NULL;
1124	}
1125
1126	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
1127		struct mlx5_vf_migration_file *migf;
1128
1129		migf = mlx5vf_pci_save_device_data(mvdev, false);
1130		if (IS_ERR(migf))
1131			return ERR_CAST(migf);
1132		get_file(migf->filp);
1133		mvdev->saving_migf = migf;
1134		return migf->filp;
1135	}
1136
1137	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
1138		mlx5vf_disable_fds(mvdev, NULL);
1139		return NULL;
1140	}
1141
1142	if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
1143	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
1144	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
1145		struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
1146		struct mlx5_vhca_data_buffer *buf;
1147		enum mlx5_vf_migf_state state;
1148		size_t size;
1149
1150		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
1151					MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
1152		if (ret)
1153			return ERR_PTR(ret);
1154		buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
1155		if (IS_ERR(buf))
1156			return ERR_CAST(buf);
1157		/* pre_copy cleanup */
1158		ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
1159		if (ret) {
1160			mlx5vf_put_data_buffer(buf);
1161			return ERR_PTR(ret);
1162		}
1163		mlx5vf_disable_fds(mvdev, &state);
1164		return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
1165	}
1166
1167	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
1168		struct mlx5_vf_migration_file *migf;
1169
1170		migf = mlx5vf_pci_resume_device_data(mvdev);
1171		if (IS_ERR(migf))
1172			return ERR_CAST(migf);
1173		get_file(migf->filp);
1174		mvdev->resuming_migf = migf;
1175		return migf->filp;
1176	}
1177
1178	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
1179		mlx5vf_disable_fds(mvdev, NULL);
1180		return NULL;
1181	}
1182
1183	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
1184	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
1185	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1186		struct mlx5_vf_migration_file *migf;
1187
1188		migf = mlx5vf_pci_save_device_data(mvdev, true);
1189		if (IS_ERR(migf))
1190			return ERR_CAST(migf);
1191		get_file(migf->filp);
1192		mvdev->saving_migf = migf;
1193		return migf->filp;
1194	}
1195
1196	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
1197		ret = mlx5vf_cmd_suspend_vhca(mvdev,
1198			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1199		if (ret)
1200			return ERR_PTR(ret);
1201		ret = mlx5vf_pci_save_device_inc_data(mvdev);
1202		return ret ? ERR_PTR(ret) : NULL;
1203	}
1204
1205	/*
1206	 * vfio_mig_get_next_state() does not use arcs other than the above
1207	 */
1208	WARN_ON(true);
1209	return ERR_PTR(-EINVAL);
1210}
1211
1212/*
1213 * This function is called in all state_mutex unlock cases to
1214 * handle a 'deferred_reset' if exists.
1215 */
1216void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
1217{
1218again:
1219	spin_lock(&mvdev->reset_lock);
1220	if (mvdev->deferred_reset) {
1221		mvdev->deferred_reset = false;
1222		spin_unlock(&mvdev->reset_lock);
1223		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1224		mlx5vf_disable_fds(mvdev, NULL);
1225		goto again;
1226	}
1227	mutex_unlock(&mvdev->state_mutex);
1228	spin_unlock(&mvdev->reset_lock);
1229}
1230
1231static struct file *
1232mlx5vf_pci_set_device_state(struct vfio_device *vdev,
1233			    enum vfio_device_mig_state new_state)
1234{
1235	struct mlx5vf_pci_core_device *mvdev = container_of(
1236		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1237	enum vfio_device_mig_state next_state;
1238	struct file *res = NULL;
1239	int ret;
1240
1241	mutex_lock(&mvdev->state_mutex);
1242	while (new_state != mvdev->mig_state) {
1243		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
1244					      new_state, &next_state);
1245		if (ret) {
1246			res = ERR_PTR(ret);
1247			break;
1248		}
1249		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
1250		if (IS_ERR(res))
1251			break;
1252		mvdev->mig_state = next_state;
1253		if (WARN_ON(res && new_state != mvdev->mig_state)) {
1254			fput(res);
1255			res = ERR_PTR(-EINVAL);
1256			break;
1257		}
1258	}
1259	mlx5vf_state_mutex_unlock(mvdev);
1260	return res;
1261}
1262
1263static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
1264				    unsigned long *stop_copy_length)
1265{
1266	struct mlx5vf_pci_core_device *mvdev = container_of(
1267		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1268	size_t state_size;
1269	u64 total_size;
1270	int ret;
1271
1272	mutex_lock(&mvdev->state_mutex);
1273	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
1274						    &total_size, 0);
1275	if (!ret)
1276		*stop_copy_length = total_size;
1277	mlx5vf_state_mutex_unlock(mvdev);
1278	return ret;
1279}
1280
1281static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
1282				       enum vfio_device_mig_state *curr_state)
1283{
1284	struct mlx5vf_pci_core_device *mvdev = container_of(
1285		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1286
1287	mutex_lock(&mvdev->state_mutex);
1288	*curr_state = mvdev->mig_state;
1289	mlx5vf_state_mutex_unlock(mvdev);
1290	return 0;
1291}
1292
1293static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1294{
1295	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1296
1297	if (!mvdev->migrate_cap)
1298		return;
1299
1300	/*
1301	 * As the higher VFIO layers are holding locks across reset and using
1302	 * those same locks with the mm_lock we need to prevent ABBA deadlock
1303	 * with the state_mutex and mm_lock.
1304	 * In case the state_mutex was taken already we defer the cleanup work
1305	 * to the unlock flow of the other running context.
1306	 */
1307	spin_lock(&mvdev->reset_lock);
1308	mvdev->deferred_reset = true;
1309	if (!mutex_trylock(&mvdev->state_mutex)) {
1310		spin_unlock(&mvdev->reset_lock);
1311		return;
1312	}
1313	spin_unlock(&mvdev->reset_lock);
1314	mlx5vf_state_mutex_unlock(mvdev);
1315}
1316
1317static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1318{
1319	struct mlx5vf_pci_core_device *mvdev = container_of(
1320		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1321	struct vfio_pci_core_device *vdev = &mvdev->core_device;
1322	int ret;
1323
1324	ret = vfio_pci_core_enable(vdev);
1325	if (ret)
1326		return ret;
1327
1328	if (mvdev->migrate_cap)
1329		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1330	vfio_pci_core_finish_enable(vdev);
1331	return 0;
1332}
1333
1334static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1335{
1336	struct mlx5vf_pci_core_device *mvdev = container_of(
1337		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1338
1339	mlx5vf_cmd_close_migratable(mvdev);
1340	vfio_pci_core_close_device(core_vdev);
1341}
1342
/*
 * Migration callbacks implemented by this driver; passed to
 * mlx5vf_cmd_set_migratable() from mlx5vf_pci_init_dev().
 */
static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};
1348
/*
 * Logging (page tracker) callbacks; the handlers are defined outside this
 * part of the file. Passed to mlx5vf_cmd_set_migratable() from
 * mlx5vf_pci_init_dev().
 */
static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};
1354
1355static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1356{
1357	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1358			struct mlx5vf_pci_core_device, core_device.vdev);
1359	int ret;
1360
1361	ret = vfio_pci_core_init_dev(core_vdev);
1362	if (ret)
1363		return ret;
1364
1365	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
1366				  &mlx5vf_pci_log_ops);
1367
1368	return 0;
1369}
1370
1371static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1372{
1373	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1374			struct mlx5vf_pci_core_device, core_device.vdev);
1375
1376	mlx5vf_cmd_remove_migratable(mvdev);
1377	vfio_pci_core_release_dev(core_vdev);
1378}
1379
/*
 * VFIO device operations. Only init/release and open/close are provided by
 * this driver; every other callback is the generic vfio-pci core or
 * vfio iommufd implementation.
 */
static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	/* Driver-specific lifecycle hooks */
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	/* Generic vfio-pci core handlers */
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	/* Generic iommufd handlers */
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};
1398
1399static int mlx5vf_pci_probe(struct pci_dev *pdev,
1400			    const struct pci_device_id *id)
1401{
1402	struct mlx5vf_pci_core_device *mvdev;
1403	int ret;
1404
1405	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1406				  &pdev->dev, &mlx5vf_pci_ops);
1407	if (IS_ERR(mvdev))
1408		return PTR_ERR(mvdev);
1409
1410	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
1411	ret = vfio_pci_core_register_device(&mvdev->core_device);
1412	if (ret)
1413		goto out_put_vdev;
1414	return 0;
1415
1416out_put_vdev:
1417	vfio_put_device(&mvdev->core_device.vdev);
1418	return ret;
1419}
1420
1421static void mlx5vf_pci_remove(struct pci_dev *pdev)
1422{
1423	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1424
1425	vfio_pci_core_unregister_device(&mvdev->core_device);
1426	vfio_put_device(&mvdev->core_device.vdev);
1427}
1428
/*
 * PCI IDs handled by this driver, exported for module autoloading via
 * MODULE_DEVICE_TABLE(). The entry uses the VFIO driver-override match.
 */
static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1435
/*
 * PCI error handlers: reset_done is driver specific (it triggers the
 * deferred-reset cleanup of the migration state), error_detected is the
 * generic vfio-pci core handler.
 */
static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};
1440
/* PCI driver definition tying together the ID table, probe/remove and error handlers */
static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	/* NOTE(review): presumably marks DMA ownership as managed by the driver (VFIO) — confirm */
	.driver_managed_dma = true,
};
1449
/* Register mlx5vf_pci_driver as this module's PCI driver */
module_pci_driver(mlx5vf_pci_driver);

/* NOTE(review): presumably needed for the vfio_iommufd_physical_* callbacks used in mlx5vf_pci_ops — confirm */
MODULE_IMPORT_NS(IOMMUFD);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1458