1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2/*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6#include "cmd.h"
7
/*
 * Small status codes named after CQ handling. Their users are not visible in
 * this part of the file - presumably returned by a CQ polling helper; confirm.
 */
enum {
	CQ_OK = 0,
	CQ_EMPTY = -1,
	CQ_POLL_ERR = -2,
};
9
10static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
11{
12	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
13	void *query_cap = NULL, *cap;
14	int ret;
15
16	query_cap = kzalloc(query_sz, GFP_KERNEL);
17	if (!query_cap)
18		return -ENOMEM;
19
20	ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap,
21					    MLX5_CAP_GENERAL_2);
22	if (ret)
23		goto out;
24
25	cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
26	if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
27		ret = -EOPNOTSUPP;
28out:
29	kfree(query_cap);
30	return ret;
31}
32
33static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
34				  u16 *vhca_id);
35static void
36_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37
38int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
39{
40	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
41	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
42	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
43	int err;
44
45	lockdep_assert_held(&mvdev->state_mutex);
46	if (mvdev->mdev_detach)
47		return -ENOTCONN;
48
49	/*
50	 * In case PRE_COPY is used, saving_migf is exposed while the device is
51	 * running. Make sure to run only once there is no active save command.
52	 * Running both in parallel, might end-up with a failure in the save
53	 * command once it will try to turn on 'tracking' on a suspended device.
54	 */
55	if (migf) {
56		err = wait_for_completion_interruptible(&migf->save_comp);
57		if (err)
58			return err;
59	}
60
61	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
62	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
63	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);
64
65	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
66	if (migf)
67		complete(&migf->save_comp);
68
69	return err;
70}
71
72int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
73{
74	u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
75	u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
76
77	lockdep_assert_held(&mvdev->state_mutex);
78	if (mvdev->mdev_detach)
79		return -ENOTCONN;
80
81	MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
82	MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
83	MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
84
85	return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
86}
87
88int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
89					  size_t *state_size, u64 *total_size,
90					  u8 query_flags)
91{
92	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
93	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
94	bool inc = query_flags & MLX5VF_QUERY_INC;
95	int ret;
96
97	lockdep_assert_held(&mvdev->state_mutex);
98	if (mvdev->mdev_detach)
99		return -ENOTCONN;
100
101	/*
102	 * In case PRE_COPY is used, saving_migf is exposed while device is
103	 * running. Make sure to run only once there is no active save command.
104	 * Running both in parallel, might end-up with a failure in the
105	 * incremental query command on un-tracked vhca.
106	 */
107	if (inc) {
108		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
109		if (ret)
110			return ret;
111		/* Upon cleanup, ignore previous pre_copy error state */
112		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR &&
113		    !(query_flags & MLX5VF_QUERY_CLEANUP)) {
114			/*
115			 * In case we had a PRE_COPY error, only query full
116			 * image for final image
117			 */
118			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
119				*state_size = 0;
120				complete(&mvdev->saving_migf->save_comp);
121				return 0;
122			}
123			query_flags &= ~MLX5VF_QUERY_INC;
124		}
125		/* Block incremental query which is state-dependent */
126		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) {
127			complete(&mvdev->saving_migf->save_comp);
128			return -ENODEV;
129		}
130	}
131
132	MLX5_SET(query_vhca_migration_state_in, in, opcode,
133		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
134	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
135	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
136	MLX5_SET(query_vhca_migration_state_in, in, incremental,
137		 query_flags & MLX5VF_QUERY_INC);
138	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);
139
140	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
141				  out);
142	if (inc)
143		complete(&mvdev->saving_migf->save_comp);
144
145	if (ret)
146		return ret;
147
148	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
149			       required_umem_size);
150	if (total_size)
151		*total_size = mvdev->chunk_mode ?
152			MLX5_GET64(query_vhca_migration_state_out, out,
153				   remaining_total_size) : *state_size;
154
155	return 0;
156}
157
158static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev)
159{
160	mvdev->tracker.object_changed = true;
161	complete(&mvdev->tracker_comp);
162}
163
164static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
165{
166	/* Mark the tracker under an error and wake it up if it's running */
167	mvdev->tracker.is_err = true;
168	complete(&mvdev->tracker_comp);
169}
170
171static int mlx5fv_vf_event(struct notifier_block *nb,
172			   unsigned long event, void *data)
173{
174	struct mlx5vf_pci_core_device *mvdev =
175		container_of(nb, struct mlx5vf_pci_core_device, nb);
176
177	switch (event) {
178	case MLX5_PF_NOTIFY_ENABLE_VF:
179		mutex_lock(&mvdev->state_mutex);
180		mvdev->mdev_detach = false;
181		mlx5vf_state_mutex_unlock(mvdev);
182		break;
183	case MLX5_PF_NOTIFY_DISABLE_VF:
184		mlx5vf_cmd_close_migratable(mvdev);
185		mutex_lock(&mvdev->state_mutex);
186		mvdev->mdev_detach = true;
187		mlx5vf_state_mutex_unlock(mvdev);
188		break;
189	default:
190		break;
191	}
192
193	return 0;
194}
195
196void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
197{
198	if (!mvdev->migrate_cap)
199		return;
200
201	/* Must be done outside the lock to let it progress */
202	set_tracker_error(mvdev);
203	mutex_lock(&mvdev->state_mutex);
204	mlx5vf_disable_fds(mvdev, NULL);
205	_mlx5vf_free_page_tracker_resources(mvdev);
206	mlx5vf_state_mutex_unlock(mvdev);
207}
208
209void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
210{
211	if (!mvdev->migrate_cap)
212		return;
213
214	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
215						&mvdev->nb);
216	destroy_workqueue(mvdev->cb_wq);
217}
218
219void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
220			       const struct vfio_migration_ops *mig_ops,
221			       const struct vfio_log_ops *log_ops)
222{
223	struct pci_dev *pdev = mvdev->core_device.pdev;
224	int ret;
225
226	if (!pdev->is_virtfn)
227		return;
228
229	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
230	if (!mvdev->mdev)
231		return;
232
233	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
234		goto end;
235
236	if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
237	      MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)))
238		goto end;
239
240	mvdev->vf_id = pci_iov_vf_id(pdev);
241	if (mvdev->vf_id < 0)
242		goto end;
243
244	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
245	if (ret)
246		goto end;
247
248	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
249				   &mvdev->vhca_id))
250		goto end;
251
252	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
253	if (!mvdev->cb_wq)
254		goto end;
255
256	mutex_init(&mvdev->state_mutex);
257	spin_lock_init(&mvdev->reset_lock);
258	mvdev->nb.notifier_call = mlx5fv_vf_event;
259	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
260						    &mvdev->nb);
261	if (ret) {
262		destroy_workqueue(mvdev->cb_wq);
263		goto end;
264	}
265
266	mvdev->migrate_cap = 1;
267	mvdev->core_device.vdev.migration_flags =
268		VFIO_MIGRATION_STOP_COPY |
269		VFIO_MIGRATION_P2P |
270		VFIO_MIGRATION_PRE_COPY;
271
272	mvdev->core_device.vdev.mig_ops = mig_ops;
273	init_completion(&mvdev->tracker_comp);
274	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
275		mvdev->core_device.vdev.log_ops = log_ops;
276
277	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
278		mvdev->chunk_mode = 1;
279
280end:
281	mlx5_vf_put_core_dev(mvdev->mdev);
282}
283
284static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
285				  u16 *vhca_id)
286{
287	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
288	int out_size;
289	void *out;
290	int ret;
291
292	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
293	out = kzalloc(out_size, GFP_KERNEL);
294	if (!out)
295		return -ENOMEM;
296
297	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
298	MLX5_SET(query_hca_cap_in, in, other_function, 1);
299	MLX5_SET(query_hca_cap_in, in, function_id, function_id);
300	MLX5_SET(query_hca_cap_in, in, op_mod,
301		 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
302		 HCA_CAP_OPMOD_GET_CUR);
303
304	ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
305	if (ret)
306		goto err_exec;
307
308	*vhca_id = MLX5_GET(query_hca_cap_out, out,
309			    capability.cmd_hca_cap.vhca_id);
310
311err_exec:
312	kfree(out);
313	return ret;
314}
315
316static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
317			struct mlx5_vhca_data_buffer *buf,
318			struct mlx5_vhca_recv_buf *recv_buf,
319			u32 *mkey)
320{
321	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
322				recv_buf->npages;
323	int err = 0, inlen;
324	__be64 *mtt;
325	void *mkc;
326	u32 *in;
327
328	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
329		sizeof(*mtt) * round_up(npages, 2);
330
331	in = kvzalloc(inlen, GFP_KERNEL);
332	if (!in)
333		return -ENOMEM;
334
335	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
336		 DIV_ROUND_UP(npages, 2));
337	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
338
339	if (buf) {
340		struct sg_dma_page_iter dma_iter;
341
342		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
343			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
344	} else {
345		int i;
346
347		for (i = 0; i < npages; i++)
348			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
349	}
350
351	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
352	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
353	MLX5_SET(mkc, mkc, lr, 1);
354	MLX5_SET(mkc, mkc, lw, 1);
355	MLX5_SET(mkc, mkc, rr, 1);
356	MLX5_SET(mkc, mkc, rw, 1);
357	MLX5_SET(mkc, mkc, pd, pdn);
358	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
359	MLX5_SET(mkc, mkc, qpn, 0xffffff);
360	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
361	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
362	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
363	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
364	kvfree(in);
365	return err;
366}
367
368static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
369{
370	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
371	struct mlx5_core_dev *mdev = mvdev->mdev;
372	int ret;
373
374	lockdep_assert_held(&mvdev->state_mutex);
375	if (mvdev->mdev_detach)
376		return -ENOTCONN;
377
378	if (buf->dmaed || !buf->allocated_length)
379		return -EINVAL;
380
381	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
382	if (ret)
383		return ret;
384
385	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
386	if (ret)
387		goto err;
388
389	buf->dmaed = true;
390
391	return 0;
392err:
393	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
394	return ret;
395}
396
397void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
398{
399	struct mlx5_vf_migration_file *migf = buf->migf;
400	struct sg_page_iter sg_iter;
401
402	lockdep_assert_held(&migf->mvdev->state_mutex);
403	WARN_ON(migf->mvdev->mdev_detach);
404
405	if (buf->dmaed) {
406		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
407		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
408				  buf->dma_dir, 0);
409	}
410
411	/* Undo alloc_pages_bulk_array() */
412	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
413		__free_page(sg_page_iter_page(&sg_iter));
414	sg_free_append_table(&buf->table);
415	kfree(buf);
416}
417
418static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
419				      unsigned int npages)
420{
421	unsigned int to_alloc = npages;
422	struct page **page_list;
423	unsigned long filled;
424	unsigned int to_fill;
425	int ret;
426
427	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
428	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
429	if (!page_list)
430		return -ENOMEM;
431
432	do {
433		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
434						page_list);
435		if (!filled) {
436			ret = -ENOMEM;
437			goto err;
438		}
439		to_alloc -= filled;
440		ret = sg_alloc_append_table_from_pages(
441			&buf->table, page_list, filled, 0,
442			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
443			GFP_KERNEL_ACCOUNT);
444
445		if (ret)
446			goto err;
447		buf->allocated_length += filled * PAGE_SIZE;
448		/* clean input for another bulk allocation */
449		memset(page_list, 0, filled * sizeof(*page_list));
450		to_fill = min_t(unsigned int, to_alloc,
451				PAGE_SIZE / sizeof(*page_list));
452	} while (to_alloc > 0);
453
454	kvfree(page_list);
455	return 0;
456
457err:
458	kvfree(page_list);
459	return ret;
460}
461
462struct mlx5_vhca_data_buffer *
463mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
464			 size_t length,
465			 enum dma_data_direction dma_dir)
466{
467	struct mlx5_vhca_data_buffer *buf;
468	int ret;
469
470	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
471	if (!buf)
472		return ERR_PTR(-ENOMEM);
473
474	buf->dma_dir = dma_dir;
475	buf->migf = migf;
476	if (length) {
477		ret = mlx5vf_add_migration_pages(buf,
478				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
479		if (ret)
480			goto end;
481
482		if (dma_dir != DMA_NONE) {
483			ret = mlx5vf_dma_data_buffer(buf);
484			if (ret)
485				goto end;
486		}
487	}
488
489	return buf;
490end:
491	mlx5vf_free_data_buffer(buf);
492	return ERR_PTR(ret);
493}
494
495void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
496{
497	spin_lock_irq(&buf->migf->list_lock);
498	buf->stop_copy_chunk_num = 0;
499	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
500	spin_unlock_irq(&buf->migf->list_lock);
501}
502
503struct mlx5_vhca_data_buffer *
504mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
505		       size_t length, enum dma_data_direction dma_dir)
506{
507	struct mlx5_vhca_data_buffer *buf, *temp_buf;
508	struct list_head free_list;
509
510	lockdep_assert_held(&migf->mvdev->state_mutex);
511	if (migf->mvdev->mdev_detach)
512		return ERR_PTR(-ENOTCONN);
513
514	INIT_LIST_HEAD(&free_list);
515
516	spin_lock_irq(&migf->list_lock);
517	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
518		if (buf->dma_dir == dma_dir) {
519			list_del_init(&buf->buf_elm);
520			if (buf->allocated_length >= length) {
521				spin_unlock_irq(&migf->list_lock);
522				goto found;
523			}
524			/*
525			 * Prevent holding redundant buffers. Put in a free
526			 * list and call at the end not under the spin lock
527			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
528			 * might sleep.
529			 */
530			list_add(&buf->buf_elm, &free_list);
531		}
532	}
533	spin_unlock_irq(&migf->list_lock);
534	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
535
536found:
537	while ((temp_buf = list_first_entry_or_null(&free_list,
538				struct mlx5_vhca_data_buffer, buf_elm))) {
539		list_del(&temp_buf->buf_elm);
540		mlx5vf_free_data_buffer(temp_buf);
541	}
542
543	return buf;
544}
545
546static void
547mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
548			      struct mlx5vf_async_data *async_data)
549{
550	kvfree(async_data->out);
551	complete(&migf->save_comp);
552	fput(migf->filp);
553}
554
555void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
556{
557	struct mlx5vf_async_data *async_data = container_of(_work,
558		struct mlx5vf_async_data, work);
559	struct mlx5_vf_migration_file *migf = container_of(async_data,
560		struct mlx5_vf_migration_file, async_data);
561
562	mutex_lock(&migf->lock);
563	if (async_data->status) {
564		mlx5vf_put_data_buffer(async_data->buf);
565		if (async_data->header_buf)
566			mlx5vf_put_data_buffer(async_data->header_buf);
567		if (!async_data->stop_copy_chunk &&
568		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
569			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
570		else
571			migf->state = MLX5_MIGF_STATE_ERROR;
572		wake_up_interruptible(&migf->poll_wait);
573	}
574	mutex_unlock(&migf->lock);
575	mlx5vf_save_callback_complete(migf, async_data);
576}
577
578static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
579			  size_t image_size, bool initial_pre_copy)
580{
581	struct mlx5_vf_migration_file *migf = header_buf->migf;
582	struct mlx5_vf_migration_header header = {};
583	unsigned long flags;
584	struct page *page;
585	u8 *to_buff;
586
587	header.record_size = cpu_to_le64(image_size);
588	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
589	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
590	page = mlx5vf_get_migration_page(header_buf, 0);
591	if (!page)
592		return -EINVAL;
593	to_buff = kmap_local_page(page);
594	memcpy(to_buff, &header, sizeof(header));
595	kunmap_local(to_buff);
596	header_buf->length = sizeof(header);
597	header_buf->start_pos = header_buf->migf->max_pos;
598	migf->max_pos += header_buf->length;
599	spin_lock_irqsave(&migf->list_lock, flags);
600	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
601	spin_unlock_irqrestore(&migf->list_lock, flags);
602	if (initial_pre_copy)
603		migf->pre_copy_initial_bytes += sizeof(header);
604	return 0;
605}
606
607static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
608{
609	struct mlx5vf_async_data *async_data = container_of(context,
610			struct mlx5vf_async_data, cb_work);
611	struct mlx5_vf_migration_file *migf = container_of(async_data,
612			struct mlx5_vf_migration_file, async_data);
613
614	if (!status) {
615		size_t next_required_umem_size = 0;
616		bool stop_copy_last_chunk;
617		size_t image_size;
618		unsigned long flags;
619		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
620				!async_data->stop_copy_chunk;
621
622		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
623				      actual_image_size);
624		if (async_data->buf->stop_copy_chunk_num)
625			next_required_umem_size = MLX5_GET(save_vhca_state_out,
626					async_data->out, next_required_umem_size);
627		stop_copy_last_chunk = async_data->stop_copy_chunk &&
628				!next_required_umem_size;
629		if (async_data->header_buf) {
630			status = add_buf_header(async_data->header_buf, image_size,
631						initial_pre_copy);
632			if (status)
633				goto err;
634		}
635		async_data->buf->length = image_size;
636		async_data->buf->start_pos = migf->max_pos;
637		migf->max_pos += async_data->buf->length;
638		spin_lock_irqsave(&migf->list_lock, flags);
639		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
640		if (async_data->buf->stop_copy_chunk_num) {
641			migf->num_ready_chunks++;
642			if (next_required_umem_size &&
643			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
644				/* Delay the next SAVE till one chunk be consumed */
645				migf->next_required_umem_size = next_required_umem_size;
646				next_required_umem_size = 0;
647			}
648		}
649		spin_unlock_irqrestore(&migf->list_lock, flags);
650		if (initial_pre_copy) {
651			migf->pre_copy_initial_bytes += image_size;
652			migf->state = MLX5_MIGF_STATE_PRE_COPY;
653		}
654		if (stop_copy_last_chunk)
655			migf->state = MLX5_MIGF_STATE_COMPLETE;
656		wake_up_interruptible(&migf->poll_wait);
657		if (next_required_umem_size)
658			mlx5vf_mig_file_set_save_work(migf,
659				/* Picking up the next chunk num */
660				(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
661				next_required_umem_size);
662		mlx5vf_save_callback_complete(migf, async_data);
663		return;
664	}
665
666err:
667	/* The error flow can't run from an interrupt context */
668	if (status == -EREMOTEIO) {
669		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
670		/* Failed in FW, print cmd out failure details */
671		mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0,
672				 async_data->out);
673	}
674
675	async_data->status = status;
676	queue_work(migf->mvdev->cb_wq, &async_data->work);
677}
678
679int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
680			       struct mlx5_vf_migration_file *migf,
681			       struct mlx5_vhca_data_buffer *buf, bool inc,
682			       bool track)
683{
684	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
685	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
686	struct mlx5_vhca_data_buffer *header_buf = NULL;
687	struct mlx5vf_async_data *async_data;
688	bool pre_copy_cleanup = false;
689	int err;
690
691	lockdep_assert_held(&mvdev->state_mutex);
692	if (mvdev->mdev_detach)
693		return -ENOTCONN;
694
695	err = wait_for_completion_interruptible(&migf->save_comp);
696	if (err)
697		return err;
698
699	if ((migf->state == MLX5_MIGF_STATE_PRE_COPY ||
700	     migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc)
701		pre_copy_cleanup = true;
702
703	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
704		/*
705		 * In case we had a PRE_COPY error, SAVE is triggered only for
706		 * the final image, read device full image.
707		 */
708		inc = false;
709
710	MLX5_SET(save_vhca_state_in, in, opcode,
711		 MLX5_CMD_OP_SAVE_VHCA_STATE);
712	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
713	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
714	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
715	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
716	MLX5_SET(save_vhca_state_in, in, incremental, inc);
717	MLX5_SET(save_vhca_state_in, in, set_track, track);
718
719	async_data = &migf->async_data;
720	async_data->buf = buf;
721	async_data->stop_copy_chunk = (!track && !pre_copy_cleanup);
722	async_data->out = kvzalloc(out_size, GFP_KERNEL);
723	if (!async_data->out) {
724		err = -ENOMEM;
725		goto err_out;
726	}
727
728	if (async_data->stop_copy_chunk) {
729		u8 header_idx = buf->stop_copy_chunk_num ?
730			buf->stop_copy_chunk_num - 1 : 0;
731
732		header_buf = migf->buf_header[header_idx];
733		migf->buf_header[header_idx] = NULL;
734	}
735
736	if (!header_buf) {
737		header_buf = mlx5vf_get_data_buffer(migf,
738			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
739		if (IS_ERR(header_buf)) {
740			err = PTR_ERR(header_buf);
741			goto err_free;
742		}
743	}
744
745	if (async_data->stop_copy_chunk)
746		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;
747
748	async_data->header_buf = header_buf;
749	get_file(migf->filp);
750	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
751			       async_data->out,
752			       out_size, mlx5vf_save_callback,
753			       &async_data->cb_work);
754	if (err)
755		goto err_exec;
756
757	return 0;
758
759err_exec:
760	if (header_buf)
761		mlx5vf_put_data_buffer(header_buf);
762	fput(migf->filp);
763err_free:
764	kvfree(async_data->out);
765err_out:
766	complete(&migf->save_comp);
767	return err;
768}
769
770int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
771			       struct mlx5_vf_migration_file *migf,
772			       struct mlx5_vhca_data_buffer *buf)
773{
774	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
775	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
776	int err;
777
778	lockdep_assert_held(&mvdev->state_mutex);
779	if (mvdev->mdev_detach)
780		return -ENOTCONN;
781
782	if (!buf->dmaed) {
783		err = mlx5vf_dma_data_buffer(buf);
784		if (err)
785			return err;
786	}
787
788	MLX5_SET(load_vhca_state_in, in, opcode,
789		 MLX5_CMD_OP_LOAD_VHCA_STATE);
790	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
791	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
792	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
793	MLX5_SET(load_vhca_state_in, in, size, buf->length);
794	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
795}
796
797int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
798{
799	int err;
800
801	lockdep_assert_held(&migf->mvdev->state_mutex);
802	if (migf->mvdev->mdev_detach)
803		return -ENOTCONN;
804
805	err = mlx5_core_alloc_pd(migf->mvdev->mdev, &migf->pdn);
806	return err;
807}
808
809void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
810{
811	lockdep_assert_held(&migf->mvdev->state_mutex);
812	if (migf->mvdev->mdev_detach)
813		return;
814
815	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
816}
817
818void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
819{
820	struct mlx5_vhca_data_buffer *entry;
821	int i;
822
823	lockdep_assert_held(&migf->mvdev->state_mutex);
824	WARN_ON(migf->mvdev->mdev_detach);
825
826	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
827		if (migf->buf[i]) {
828			mlx5vf_free_data_buffer(migf->buf[i]);
829			migf->buf[i] = NULL;
830		}
831
832		if (migf->buf_header[i]) {
833			mlx5vf_free_data_buffer(migf->buf_header[i]);
834			migf->buf_header[i] = NULL;
835		}
836	}
837
838	list_splice(&migf->avail_list, &migf->buf_list);
839
840	while ((entry = list_first_entry_or_null(&migf->buf_list,
841				struct mlx5_vhca_data_buffer, buf_elm))) {
842		list_del(&entry->buf_elm);
843		mlx5vf_free_data_buffer(entry);
844	}
845
846	mlx5vf_cmd_dealloc_pd(migf);
847}
848
849static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
850				 struct mlx5vf_pci_core_device *mvdev,
851				 struct rb_root_cached *ranges, u32 nnodes)
852{
853	int max_num_range =
854		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
855	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
856	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
857	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
858	struct interval_tree_node *node = NULL;
859	u64 total_ranges_len = 0;
860	u32 num_ranges = nnodes;
861	u8 log_addr_space_size;
862	void *range_list_ptr;
863	void *obj_context;
864	void *cmd_hdr;
865	int inlen;
866	void *in;
867	int err;
868	int i;
869
870	if (num_ranges > max_num_range) {
871		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
872		num_ranges = max_num_range;
873	}
874
875	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
876				 record_size * num_ranges;
877	in = kzalloc(inlen, GFP_KERNEL);
878	if (!in)
879		return -ENOMEM;
880
881	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
882			       general_obj_in_cmd_hdr);
883	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
884		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
885	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
886		 MLX5_OBJ_TYPE_PAGE_TRACK);
887	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
888	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
889	MLX5_SET(page_track, obj_context, track_type, 1);
890	MLX5_SET(page_track, obj_context, log_page_size,
891		 ilog2(tracker->host_qp->tracked_page_size));
892	MLX5_SET(page_track, obj_context, log_msg_size,
893		 ilog2(tracker->host_qp->max_msg_size));
894	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
895	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);
896
897	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
898	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
899	for (i = 0; i < num_ranges; i++) {
900		void *addr_range_i_base = range_list_ptr + record_size * i;
901		unsigned long length = node->last - node->start + 1;
902
903		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
904			   node->start);
905		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
906		total_ranges_len += length;
907		node = interval_tree_iter_next(node, 0, ULONG_MAX);
908	}
909
910	WARN_ON(node);
911	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
912	if (log_addr_space_size <
913	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
914	    log_addr_space_size >
915	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
916		err = -EOPNOTSUPP;
917		goto out;
918	}
919
920	MLX5_SET(page_track, obj_context, log_addr_space_size,
921		 log_addr_space_size);
922	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
923	if (err)
924		goto out;
925
926	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
927out:
928	kfree(in);
929	return err;
930}
931
932static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
933				      u32 tracker_id)
934{
935	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
936	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
937
938	MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
939	MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
940	MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
941
942	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
943}
944
945static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
946				     u32 tracker_id, unsigned long iova,
947				     unsigned long length, u32 tracker_state)
948{
949	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
950	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
951	void *obj_context;
952	void *cmd_hdr;
953
954	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
955	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
956	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
957	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);
958
959	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
960	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
961	MLX5_SET64(page_track, obj_context, range_start_address, iova);
962	MLX5_SET64(page_track, obj_context, length, length);
963	MLX5_SET(page_track, obj_context, state, tracker_state);
964
965	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
966}
967
968static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev,
969				    struct mlx5_vhca_page_tracker *tracker)
970{
971	u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {};
972	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
973	void *obj_context;
974	void *cmd_hdr;
975	int err;
976
977	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
978	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
979	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
980	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id);
981
982	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
983	if (err)
984		return err;
985
986	obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context);
987	tracker->status = MLX5_GET(page_track, obj_context, state);
988	return 0;
989}
990
991static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
992			     struct mlx5_vhca_cq_buf *buf, int nent,
993			     int cqe_size)
994{
995	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
996	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
997	u8 log_wq_sz = ilog2(cqe_size);
998	int err;
999
1000	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
1001				       mdev->priv.numa_node);
1002	if (err)
1003		return err;
1004
1005	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
1006	buf->cqe_size = cqe_size;
1007	buf->nent = nent;
1008	return 0;
1009}
1010
1011static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
1012{
1013	struct mlx5_cqe64 *cqe64;
1014	void *cqe;
1015	int i;
1016
1017	for (i = 0; i < buf->nent; i++) {
1018		cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i);
1019		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
1020		cqe64->op_own = MLX5_CQE_INVALID << 4;
1021	}
1022}
1023
1024static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
1025			      struct mlx5_vhca_cq *cq)
1026{
1027	mlx5_core_destroy_cq(mdev, &cq->mcq);
1028	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1029	mlx5_db_free(mdev, &cq->db);
1030}
1031
1032static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
1033{
1034	if (type != MLX5_EVENT_TYPE_CQ_ERROR)
1035		return;
1036
1037	set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
1038				       tracker.cq.mcq));
1039}
1040
1041static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
1042				 void *data)
1043{
1044	struct mlx5_vhca_page_tracker *tracker =
1045		mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
1046	struct mlx5vf_pci_core_device *mvdev = container_of(
1047		tracker, struct mlx5vf_pci_core_device, tracker);
1048	struct mlx5_eqe_obj_change *object;
1049	struct mlx5_eqe *eqe = data;
1050	u8 event_type = (u8)type;
1051	u8 queue_type;
1052	u32 obj_id;
1053	int qp_num;
1054
1055	switch (event_type) {
1056	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
1057	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
1058	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
1059		queue_type = eqe->data.qp_srq.type;
1060		if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
1061			break;
1062		qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
1063		if (qp_num != tracker->host_qp->qpn &&
1064		    qp_num != tracker->fw_qp->qpn)
1065			break;
1066		set_tracker_error(mvdev);
1067		break;
1068	case MLX5_EVENT_TYPE_OBJECT_CHANGE:
1069		object = &eqe->data.obj_change;
1070		obj_id = be32_to_cpu(object->obj_id);
1071		if (obj_id == tracker->id)
1072			set_tracker_change_event(mvdev);
1073		break;
1074	default:
1075		break;
1076	}
1077
1078	return NOTIFY_OK;
1079}
1080
1081static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
1082			       struct mlx5_eqe *eqe)
1083{
1084	struct mlx5vf_pci_core_device *mvdev =
1085		container_of(mcq, struct mlx5vf_pci_core_device,
1086			     tracker.cq.mcq);
1087
1088	complete(&mvdev->tracker_comp);
1089}
1090
1091static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
1092			    struct mlx5_vhca_page_tracker *tracker,
1093			    size_t ncqe)
1094{
1095	int cqe_size = cache_line_size() == 128 ? 128 : 64;
1096	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
1097	struct mlx5_vhca_cq *cq;
1098	int inlen, err, eqn;
1099	void *cqc, *in;
1100	__be64 *pas;
1101	int vector;
1102
1103	cq = &tracker->cq;
1104	ncqe = roundup_pow_of_two(ncqe);
1105	err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node);
1106	if (err)
1107		return err;
1108
1109	cq->ncqe = ncqe;
1110	cq->mcq.set_ci_db = cq->db.db;
1111	cq->mcq.arm_db = cq->db.db + 1;
1112	cq->mcq.cqe_sz = cqe_size;
1113	err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size);
1114	if (err)
1115		goto err_db_free;
1116
1117	init_cq_frag_buf(&cq->buf);
1118	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
1119		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
1120		cq->buf.frag_buf.npages;
1121	in = kvzalloc(inlen, GFP_KERNEL);
1122	if (!in) {
1123		err = -ENOMEM;
1124		goto err_buff;
1125	}
1126
1127	vector = raw_smp_processor_id() % mlx5_comp_vectors_max(mdev);
1128	err = mlx5_comp_eqn_get(mdev, vector, &eqn);
1129	if (err)
1130		goto err_vec;
1131
1132	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
1133	MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
1134	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
1135	MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
1136	MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
1137		 MLX5_ADAPTER_PAGE_SHIFT);
1138	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
1139	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
1140	mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas);
1141	cq->mcq.comp = mlx5vf_cq_complete;
1142	cq->mcq.event = mlx5vf_cq_event;
1143	err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out));
1144	if (err)
1145		goto err_vec;
1146
1147	mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1148		    cq->mcq.cons_index);
1149	kvfree(in);
1150	return 0;
1151
1152err_vec:
1153	kvfree(in);
1154err_buff:
1155	mlx5_frag_buf_free(mdev, &cq->buf.frag_buf);
1156err_db_free:
1157	mlx5_db_free(mdev, &cq->db);
1158	return err;
1159}
1160
1161static struct mlx5_vhca_qp *
1162mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
1163		    struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
1164{
1165	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
1166	struct mlx5_vhca_qp *qp;
1167	u8 log_rq_stride;
1168	u8 log_rq_sz;
1169	void *qpc;
1170	int inlen;
1171	void *in;
1172	int err;
1173
1174	qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT);
1175	if (!qp)
1176		return ERR_PTR(-ENOMEM);
1177
1178	err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node);
1179	if (err)
1180		goto err_free;
1181
1182	if (max_recv_wr) {
1183		qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
1184		log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
1185		log_rq_sz = ilog2(qp->rq.wqe_cnt);
1186		err = mlx5_frag_buf_alloc_node(mdev,
1187			wq_get_byte_sz(log_rq_sz, log_rq_stride),
1188			&qp->buf, mdev->priv.numa_node);
1189		if (err)
1190			goto err_db_free;
1191		mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc);
1192	}
1193
1194	qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
1195	inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
1196		MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
1197		qp->buf.npages;
1198	in = kvzalloc(inlen, GFP_KERNEL);
1199	if (!in) {
1200		err = -ENOMEM;
1201		goto err_in;
1202	}
1203
1204	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
1205	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
1206	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
1207	MLX5_SET(qpc, qpc, pd, tracker->pdn);
1208	MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
1209	MLX5_SET(qpc, qpc, log_page_size,
1210		 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
1211	MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
1212	if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
1213		MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
1214	MLX5_SET(qpc, qpc, no_sq, 1);
1215	if (max_recv_wr) {
1216		MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
1217		MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
1218		MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
1219		MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
1220		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
1221		mlx5_fill_page_frag_array(&qp->buf,
1222					  (__be64 *)MLX5_ADDR_OF(create_qp_in,
1223								 in, pas));
1224	} else {
1225		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
1226	}
1227
1228	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
1229	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
1230	kvfree(in);
1231	if (err)
1232		goto err_in;
1233
1234	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
1235	return qp;
1236
1237err_in:
1238	if (max_recv_wr)
1239		mlx5_frag_buf_free(mdev, &qp->buf);
1240err_db_free:
1241	mlx5_db_free(mdev, &qp->db);
1242err_free:
1243	kfree(qp);
1244	return ERR_PTR(err);
1245}
1246
1247static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
1248{
1249	struct mlx5_wqe_data_seg *data;
1250	unsigned int ix;
1251
1252	WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
1253	ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
1254	data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix);
1255	data->byte_count = cpu_to_be32(qp->max_msg_size);
1256	data->lkey = cpu_to_be32(qp->recv_buf.mkey);
1257	data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
1258	qp->rq.pc++;
1259	/* Make sure that descriptors are written before doorbell record. */
1260	dma_wmb();
1261	*qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
1262}
1263
1264static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1265			      struct mlx5_vhca_qp *qp, u32 remote_qpn,
1266			      bool host_qp)
1267{
1268	u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1269	u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1270	u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1271	void *qpc;
1272	int ret;
1273
1274	/* Init */
1275	qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1276	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1277	MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1278	MLX5_SET(qpc, qpc, rre, 1);
1279	MLX5_SET(qpc, qpc, rwe, 1);
1280	MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1281	MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1282	ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1283	if (ret)
1284		return ret;
1285
1286	if (host_qp) {
1287		struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1288		int i;
1289
1290		for (i = 0; i < qp->rq.wqe_cnt; i++) {
1291			mlx5vf_post_recv(qp);
1292			recv_buf->next_rq_offset += qp->max_msg_size;
1293		}
1294	}
1295
1296	/* RTR */
1297	qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1298	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1299	MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1300	MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1301	MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1302	MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1303	MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1304	MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1305	MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1306	MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1307	ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1308	if (ret || host_qp)
1309		return ret;
1310
1311	/* RTS */
1312	qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1313	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1314	MLX5_SET(qpc, qpc, retry_count, 7);
1315	MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1316	MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1317	MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1318	MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1319
1320	return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1321}
1322
1323static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
1324			      struct mlx5_vhca_qp *qp)
1325{
1326	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
1327
1328	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
1329	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
1330	mlx5_cmd_exec_in(mdev, destroy_qp, in);
1331
1332	mlx5_frag_buf_free(mdev, &qp->buf);
1333	mlx5_db_free(mdev, &qp->db);
1334	kfree(qp);
1335}
1336
1337static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1338{
1339	int i;
1340
1341	/* Undo alloc_pages_bulk_array() */
1342	for (i = 0; i < recv_buf->npages; i++)
1343		__free_page(recv_buf->page_list[i]);
1344
1345	kvfree(recv_buf->page_list);
1346}
1347
1348static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1349			    unsigned int npages)
1350{
1351	unsigned int filled = 0, done = 0;
1352	int i;
1353
1354	recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
1355				       GFP_KERNEL_ACCOUNT);
1356	if (!recv_buf->page_list)
1357		return -ENOMEM;
1358
1359	for (;;) {
1360		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1361						npages - done,
1362						recv_buf->page_list + done);
1363		if (!filled)
1364			goto err;
1365
1366		done += filled;
1367		if (done == npages)
1368			break;
1369	}
1370
1371	recv_buf->npages = npages;
1372	return 0;
1373
1374err:
1375	for (i = 0; i < npages; i++) {
1376		if (recv_buf->page_list[i])
1377			__free_page(recv_buf->page_list[i]);
1378	}
1379
1380	kvfree(recv_buf->page_list);
1381	return -ENOMEM;
1382}
1383
1384static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1385				   struct mlx5_vhca_recv_buf *recv_buf)
1386{
1387	int i, j;
1388
1389	recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
1390				       sizeof(*recv_buf->dma_addrs),
1391				       GFP_KERNEL_ACCOUNT);
1392	if (!recv_buf->dma_addrs)
1393		return -ENOMEM;
1394
1395	for (i = 0; i < recv_buf->npages; i++) {
1396		recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1397						      recv_buf->page_list[i],
1398						      0, PAGE_SIZE,
1399						      DMA_FROM_DEVICE);
1400		if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
1401			goto error;
1402	}
1403	return 0;
1404
1405error:
1406	for (j = 0; j < i; j++)
1407		dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1408				 PAGE_SIZE, DMA_FROM_DEVICE);
1409
1410	kvfree(recv_buf->dma_addrs);
1411	return -ENOMEM;
1412}
1413
1414static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1415				      struct mlx5_vhca_recv_buf *recv_buf)
1416{
1417	int i;
1418
1419	for (i = 0; i < recv_buf->npages; i++)
1420		dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1421				 PAGE_SIZE, DMA_FROM_DEVICE);
1422
1423	kvfree(recv_buf->dma_addrs);
1424}
1425
1426static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
1427					  struct mlx5_vhca_qp *qp)
1428{
1429	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1430
1431	mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
1432	unregister_dma_recv_pages(mdev, recv_buf);
1433	free_recv_pages(&qp->recv_buf);
1434}
1435
1436static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
1437					  struct mlx5_vhca_qp *qp, u32 pdn,
1438					  u64 rq_size)
1439{
1440	unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
1441	struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1442	int err;
1443
1444	err = alloc_recv_pages(recv_buf, npages);
1445	if (err < 0)
1446		return err;
1447
1448	err = register_dma_recv_pages(mdev, recv_buf);
1449	if (err)
1450		goto end;
1451
1452	err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
1453	if (err)
1454		goto err_create_mkey;
1455
1456	return 0;
1457
1458err_create_mkey:
1459	unregister_dma_recv_pages(mdev, recv_buf);
1460end:
1461	free_recv_pages(recv_buf);
1462	return err;
1463}
1464
1465static void
1466_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
1467{
1468	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1469	struct mlx5_core_dev *mdev = mvdev->mdev;
1470
1471	lockdep_assert_held(&mvdev->state_mutex);
1472
1473	if (!mvdev->log_active)
1474		return;
1475
1476	WARN_ON(mvdev->mdev_detach);
1477
1478	mlx5_eq_notifier_unregister(mdev, &tracker->nb);
1479	mlx5vf_cmd_destroy_tracker(mdev, tracker->id);
1480	mlx5vf_destroy_qp(mdev, tracker->fw_qp);
1481	mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp);
1482	mlx5vf_destroy_qp(mdev, tracker->host_qp);
1483	mlx5vf_destroy_cq(mdev, &tracker->cq);
1484	mlx5_core_dealloc_pd(mdev, tracker->pdn);
1485	mlx5_put_uars_page(mdev, tracker->uar);
1486	mvdev->log_active = false;
1487}
1488
1489int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1490{
1491	struct mlx5vf_pci_core_device *mvdev = container_of(
1492		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1493
1494	mutex_lock(&mvdev->state_mutex);
1495	if (!mvdev->log_active)
1496		goto end;
1497
1498	_mlx5vf_free_page_tracker_resources(mvdev);
1499	mvdev->log_active = false;
1500end:
1501	mlx5vf_state_mutex_unlock(mvdev);
1502	return 0;
1503}
1504
1505int mlx5vf_start_page_tracker(struct vfio_device *vdev,
1506			      struct rb_root_cached *ranges, u32 nnodes,
1507			      u64 *page_size)
1508{
1509	struct mlx5vf_pci_core_device *mvdev = container_of(
1510		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1511	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1512	u8 log_tracked_page = ilog2(*page_size);
1513	struct mlx5_vhca_qp *host_qp;
1514	struct mlx5_vhca_qp *fw_qp;
1515	struct mlx5_core_dev *mdev;
1516	u32 max_msg_size = PAGE_SIZE;
1517	u64 rq_size = SZ_2M;
1518	u32 max_recv_wr;
1519	int err;
1520
1521	mutex_lock(&mvdev->state_mutex);
1522	if (mvdev->mdev_detach) {
1523		err = -ENOTCONN;
1524		goto end;
1525	}
1526
1527	if (mvdev->log_active) {
1528		err = -EINVAL;
1529		goto end;
1530	}
1531
1532	mdev = mvdev->mdev;
1533	memset(tracker, 0, sizeof(*tracker));
1534	tracker->uar = mlx5_get_uars_page(mdev);
1535	if (IS_ERR(tracker->uar)) {
1536		err = PTR_ERR(tracker->uar);
1537		goto end;
1538	}
1539
1540	err = mlx5_core_alloc_pd(mdev, &tracker->pdn);
1541	if (err)
1542		goto err_uar;
1543
1544	max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
1545	err = mlx5vf_create_cq(mdev, tracker, max_recv_wr);
1546	if (err)
1547		goto err_dealloc_pd;
1548
1549	host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
1550	if (IS_ERR(host_qp)) {
1551		err = PTR_ERR(host_qp);
1552		goto err_cq;
1553	}
1554
1555	host_qp->max_msg_size = max_msg_size;
1556	if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1557				pg_track_log_min_page_size)) {
1558		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1559				pg_track_log_min_page_size);
1560	} else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1561				pg_track_log_max_page_size)) {
1562		log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
1563				pg_track_log_max_page_size);
1564	}
1565
1566	host_qp->tracked_page_size = (1ULL << log_tracked_page);
1567	err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn,
1568					     rq_size);
1569	if (err)
1570		goto err_host_qp;
1571
1572	fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0);
1573	if (IS_ERR(fw_qp)) {
1574		err = PTR_ERR(fw_qp);
1575		goto err_recv_resources;
1576	}
1577
1578	err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true);
1579	if (err)
1580		goto err_activate;
1581
1582	err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false);
1583	if (err)
1584		goto err_activate;
1585
1586	tracker->host_qp = host_qp;
1587	tracker->fw_qp = fw_qp;
1588	err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
1589	if (err)
1590		goto err_activate;
1591
1592	MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
1593	mlx5_eq_notifier_register(mdev, &tracker->nb);
1594	*page_size = host_qp->tracked_page_size;
1595	mvdev->log_active = true;
1596	mlx5vf_state_mutex_unlock(mvdev);
1597	return 0;
1598
1599err_activate:
1600	mlx5vf_destroy_qp(mdev, fw_qp);
1601err_recv_resources:
1602	mlx5vf_free_qp_recv_resources(mdev, host_qp);
1603err_host_qp:
1604	mlx5vf_destroy_qp(mdev, host_qp);
1605err_cq:
1606	mlx5vf_destroy_cq(mdev, &tracker->cq);
1607err_dealloc_pd:
1608	mlx5_core_dealloc_pd(mdev, tracker->pdn);
1609err_uar:
1610	mlx5_put_uars_page(mdev, tracker->uar);
1611end:
1612	mlx5vf_state_mutex_unlock(mvdev);
1613	return err;
1614}
1615
1616static void
1617set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
1618		  struct iova_bitmap *dirty)
1619{
1620	u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
1621	u32 nent = size / entry_size;
1622	struct page *page;
1623	u64 addr;
1624	u64 *buf;
1625	int i;
1626
1627	if (WARN_ON(index >= qp->recv_buf.npages ||
1628		    (nent > qp->max_msg_size / entry_size)))
1629		return;
1630
1631	page = qp->recv_buf.page_list[index];
1632	buf = kmap_local_page(page);
1633	for (i = 0; i < nent; i++) {
1634		addr = MLX5_GET(page_track_report_entry, buf + i,
1635				dirty_address_low);
1636		addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
1637				      dirty_address_high) << 32;
1638		iova_bitmap_set(dirty, addr, qp->tracked_page_size);
1639	}
1640	kunmap_local(buf);
1641}
1642
1643static void
1644mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
1645	      struct iova_bitmap *dirty, int *tracker_status)
1646{
1647	u32 size;
1648	int ix;
1649
1650	qp->rq.cc++;
1651	*tracker_status = be32_to_cpu(cqe->immediate) >> 28;
1652	size = be32_to_cpu(cqe->byte_cnt);
1653	ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);
1654
1655	/* zero length CQE, no data */
1656	WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
1657	if (size)
1658		set_report_output(size, ix, qp, dirty);
1659
1660	qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
1661	mlx5vf_post_recv(qp);
1662}
1663
1664static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
1665{
1666	return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n);
1667}
1668
1669static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
1670{
1671	void *cqe = get_cqe(cq, n & (cq->ncqe - 1));
1672	struct mlx5_cqe64 *cqe64;
1673
1674	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1675
1676	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
1677	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
1678		return cqe64;
1679	} else {
1680		return NULL;
1681	}
1682}
1683
1684static int
1685mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
1686		   struct iova_bitmap *dirty, int *tracker_status)
1687{
1688	struct mlx5_cqe64 *cqe;
1689	u8 opcode;
1690
1691	cqe = get_sw_cqe(cq, cq->mcq.cons_index);
1692	if (!cqe)
1693		return CQ_EMPTY;
1694
1695	++cq->mcq.cons_index;
1696	/*
1697	 * Make sure we read CQ entry contents after we've checked the
1698	 * ownership bit.
1699	 */
1700	rmb();
1701	opcode = get_cqe_opcode(cqe);
1702	switch (opcode) {
1703	case MLX5_CQE_RESP_SEND_IMM:
1704		mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
1705		return CQ_OK;
1706	default:
1707		return CQ_POLL_ERR;
1708	}
1709}
1710
1711int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
1712				  unsigned long length,
1713				  struct iova_bitmap *dirty)
1714{
1715	struct mlx5vf_pci_core_device *mvdev = container_of(
1716		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1717	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
1718	struct mlx5_vhca_cq *cq = &tracker->cq;
1719	struct mlx5_core_dev *mdev;
1720	int poll_err, err;
1721
1722	mutex_lock(&mvdev->state_mutex);
1723	if (!mvdev->log_active) {
1724		err = -EINVAL;
1725		goto end;
1726	}
1727
1728	if (mvdev->mdev_detach) {
1729		err = -ENOTCONN;
1730		goto end;
1731	}
1732
1733	if (tracker->is_err) {
1734		err = -EIO;
1735		goto end;
1736	}
1737
1738	mdev = mvdev->mdev;
1739	err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length,
1740					MLX5_PAGE_TRACK_STATE_REPORTING);
1741	if (err)
1742		goto end;
1743
1744	tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
1745	while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
1746	       !tracker->is_err) {
1747		poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty,
1748					      &tracker->status);
1749		if (poll_err == CQ_EMPTY) {
1750			mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map,
1751				    cq->mcq.cons_index);
1752			poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp,
1753						      dirty, &tracker->status);
1754			if (poll_err == CQ_EMPTY) {
1755				wait_for_completion(&mvdev->tracker_comp);
1756				if (tracker->object_changed) {
1757					tracker->object_changed = false;
1758					err = mlx5vf_cmd_query_tracker(mdev, tracker);
1759					if (err)
1760						goto end;
1761				}
1762				continue;
1763			}
1764		}
1765		if (poll_err == CQ_POLL_ERR) {
1766			err = -EIO;
1767			goto end;
1768		}
1769		mlx5_cq_set_ci(&cq->mcq);
1770	}
1771
1772	if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
1773		tracker->is_err = true;
1774
1775	if (tracker->is_err)
1776		err = -EIO;
1777end:
1778	mlx5vf_state_mutex_unlock(mvdev);
1779	return err;
1780}
1781