// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2015-2018 Intel Corporation.
 */
#include <asm/page.h>
#include <linux/string.h>

#include "mmu_rb.h"
#include "user_exp_rcv.h"
#include "trace.h"

static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
static void __clear_tid_node(struct hfi1_filedata *fd,
			     struct tid_rb_node *node);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);

static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};
static const struct mmu_interval_notifier_ops tid_cover_ops = {
	.invalidate = tid_cover_invalidate,
};

/*
 * Initialize context and file private data needed for Expected
 * receive caching. This needs to be done after the context has
 * been configured with the eager/expected RcvEntry counts.
 */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	fd->entry_to_rb = kcalloc(uctxt->expected_count,
				  sizeof(struct rb_node *),
				  GFP_KERNEL);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kcalloc(uctxt->expected_count,
					   sizeof(*fd->invalid_tids),
					   GFP_KERNEL);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
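	/*
	 * Illustrative example, with hypothetical numbers rather than any
	 * particular hardware configuration: for expected_count = 2048 and
	 * subctxt_cnt = 3, the split below yields tid_limit = 682 with a
	 * remainder of 2, so subctxts 0 and 1 each get 683 entries and
	 * subctxt 2 gets 682 (683 + 683 + 682 = 2048).
	 */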
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}

void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}

/*
 * Release pinned receive buffer pages.
 *
 * @mapped: true if the pages have been DMA mapped, false otherwise.
 * @idx: Index of the first page to unpin.
 * @npages: Number of pages to unpin.
 *
 * If the pages have been DMA mapped (indicated by the mapped parameter),
 * their info will be passed via a struct tid_rb_node. If they haven't been
 * mapped, their info will be passed via a struct tid_user_buf.
 */
static void unpin_rcv_pages(struct hfi1_filedata *fd,
			    struct tid_user_buf *tidbuf,
			    struct tid_rb_node *node,
			    unsigned int idx,
			    unsigned int npages,
			    bool mapped)
{
	struct page **pages;
	struct hfi1_devdata *dd = fd->uctxt->dd;
	struct mm_struct *mm;

	if (mapped) {
		dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
				 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
		pages = &node->pages[idx];
		mm = mm_from_tid_node(node);
	} else {
		pages = &tidbuf->pages[idx];
		mm = current->mm;
	}
	hfi1_release_user_pages(mm, pages, npages, mapped);
	fd->tid_n_pinned -= npages;
}

/*
 * Pin receive buffer pages.
 */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages = tidbuf->npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	tidbuf->pages = pages;
	fd->tid_n_pinned += pinned;
	return pinned;
}

/*
 * RcvArray entry allocation for Expected Receives is done by the
 * following algorithm:
 *
 * The context keeps 3 lists of groups of RcvArray entries:
 *   1. List of empty groups - tid_group_list
 *      This list is created during user context creation and
 *      contains elements which describe sets (of 8) of empty
 *      RcvArray entries.
 *   2. List of partially used groups - tid_used_list
 *      This list contains sets of RcvArray entries which are
 *      not completely used up. Another mapping request could
 *      use some or all of the remaining entries.
 *   3. List of full groups - tid_full_list
 *      This is the list where sets that are completely used
 *      up go.
 *
 * An attempt to optimize the usage of RcvArray entries is
 * made by finding all sets of physically contiguous pages in a
 * user's buffer.
 * These physically contiguous sets are further split into
 * sizes supported by the receive engine of the HFI. The
 * resulting sets of pages are stored in struct tid_pageset,
 * which describes the sets as:
 *    * .count - number of pages in this set
 *    * .idx - starting index into struct page ** array
 *                    of this set
 *
 * From this point on, the algorithm deals with the page sets
 * described above. The number of pagesets is divided by the
 * RcvArray group size to produce the number of full groups
 * needed.
 *
 * Groups from the 3 lists are manipulated using the following
 * rules:
 *   1. For each set of 8 pagesets, a complete group from
 *      tid_group_list is taken, programmed, and moved to
 *      the tid_full_list list.
 *   2. For all remaining pagesets:
 *      2.1 If the tid_used_list is empty and the tid_group_list
 *          is empty, stop processing pagesets and return only
 *          what has been programmed up to this point.
 *      2.2 If the tid_used_list is empty and the tid_group_list
 *          is not empty, move a group from tid_group_list to
 *          tid_used_list.
 *      2.3 For each group in tid_used_list, program as much as
 *          can fit into the group. If the group becomes fully
 *          used, move it to tid_full_list.
 */
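/*
 * Illustrative walk-through (hypothetical numbers): with a group size of
 * 8 and a buffer that resolves to 20 pagesets, rule 1 programs two
 * complete groups (16 pagesets) from tid_group_list and moves them to
 * tid_full_list. The remaining 4 pagesets fall under rule 2: a group is
 * moved from tid_group_list to tid_used_list (2.2) and 4 of its 8
 * entries are programmed (2.3), leaving it on tid_used_list.
 */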
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;
	unsigned long mmu_seq = 0;

	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;
	if (tinfo->length == 0)
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	mutex_init(&tidbuf->cover_mutex);
	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length);
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		ret = -ENOMEM;
		goto fail_release_mem;
	}

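	/*
	 * When the MMU notifier path is in use, record the interval
	 * notifier sequence before pinning; mmu_interval_read_retry()
	 * is checked near the end of setup to detect an unmap of this
	 * range that raced with programming (see tid_cover_invalidate()).
	 */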
	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&tidbuf->notifier, current->mm,
			tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
			&tid_cover_ops);
		if (ret)
			goto fail_release_mem;
		mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		ret = (pinned < 0) ? pinned : -ENOSPC;
		goto fail_unpin;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

	/* Reserve the number of expected tids to be used. */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	fd->tid_used += pageset_count;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count) {
		ret = -ENOSPC;
		goto fail_unreserve;
	}

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto fail_unreserve;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		mapped_pages += mapped;
	}
	while (tididx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - tididx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (tididx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);

	/* fail if nothing was programmed, set error if none provided */
	if (tididx == 0) {
		if (ret >= 0)
			ret = -ENOSPC;
		goto fail_unreserve;
	}

	/* adjust reserved tid_used to actual count */
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count - tididx;
	spin_unlock(&fd->tid_lock);

	/* unpin all pages not covered by a TID */
	unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
			false);

	if (fd->use_mn) {
		/* check for an invalidate during setup */
		bool fail = false;

		mutex_lock(&tidbuf->cover_mutex);
		fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
		mutex_unlock(&tidbuf->cover_mutex);

		if (fail) {
			ret = -EBUSY;
			goto fail_unprogram;
		}
	}

	tinfo->tidcnt = tididx;
	tinfo->length = mapped_pages * PAGE_SIZE;

	if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
			 tidlist, sizeof(tidlist[0]) * tididx)) {
		ret = -EFAULT;
		goto fail_unprogram;
	}

	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return 0;

fail_unprogram:
	/* unprogram, unmap, and unpin all allocated TIDs */
	tinfo->tidlist = (unsigned long)tidlist;
	hfi1_user_exp_rcv_clear(fd, tinfo);
	tinfo->tidlist = 0;
	pinned = 0;		/* nothing left to unpin */
	pageset_count = 0;	/* nothing left reserved */
fail_unreserve:
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count;
	spin_unlock(&fd->tid_lock);
fail_unpin:
	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	if (pinned > 0)
		unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
fail_release_mem:
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return ret;
}

int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_array_user(u64_to_user_ptr(tinfo->tidlist),
				    tinfo->tidcnt, sizeof(tidinfo[0]));
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx]);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}

int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		return -EFAULT;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}

static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	pfn = page_to_pfn(pages[0]);
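	/*
	 * Note: the loop below intentionally runs one extra iteration
	 * (i == npages) with a this_pfn of 0, so the final run of
	 * contiguous pages is flushed through the same code path as any
	 * earlier run.
	 */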
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the pfns are not sequential, the pages are not
		 * physically contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down
			 * into sizes supported by the HW.
			 * There are two main constraints:
			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *        If the total set size is bigger than that,
			 *        program only a MAX_EXPECTED_BUFFER chunk.
			 *     2. The buffer size has to be a power of two. If
			 *        it is not, round down to the closest power
			 *        of 2 and program that size.
			 */
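			/*
			 * For example (hypothetical sizes, assuming 4 KiB
			 * pages and a run that fits under
			 * MAX_EXPECTED_BUFFER): 7 contiguous pages are
			 * emitted as sets of 4, 2 and 1 pages, since each
			 * set must be a power-of-2 size.
			 */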
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}

/**
 * program_rcvarray() - program an RcvArray group with receive buffers
 * @fd: filedata pointer
 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 *	  virtual address, buffer length, page pointers, pagesets (array of
 *	  struct tid_pageset holding information on physically contiguous
 *	  chunks from the user buffer), and other fields.
 * @grp: RcvArray group
 * @count: number of struct tid_pageset entries to program
 * @tidlist: the array of u32 elements where the information about the
 *           programmed RcvArray entries is to be encoded.
 * @tididx: starting offset into tidlist
 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 *           entries.
 *
 * This function will program up to 'count' RcvArray entries from the
 * group 'grp'. To make best use of write-combining writes, the function will
 * perform writes to the unused RcvArray entries which will be ignored by the
 * HW. Each RcvArray entry will be programmed with a physically contiguous
 * buffer chunk from the user's virtual buffer.
 *
 * Return:
 * -EINVAL if the requested count is larger than the size of the group,
 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 * number of RcvArray entries programmed.
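 *
 * For example (hypothetical layout): with count = 3 in a group of 8 whose
 * first two entries are already in use, entries 0-1 and 5-7 receive blank
 * write-combining fills while entries 2-4 are programmed with pagesets.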
 */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	unsigned int start = *tididx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/* Find the first unused entry in the group */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

		tidinfo = create_tid(rcventry - uctxt->expected_base, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	return idx;
}

static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc(struct_size(node, pages, npages), GFP_KERNEL);
	if (!node)
		return -ENOMEM;

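	/*
	 * find_phys_blocks() only produces sets of physically contiguous
	 * pages, so the whole chunk can be mapped as one DMA region
	 * starting at the first page.
	 */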
	phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, DMA_FROM_DEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	mutex_init(&node->invalidate_mutex);
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, flex_array_size(node, pages, npages));

	if (fd->use_mn) {
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
			 DMA_FROM_DEVICE);
	kfree(node);
	return -EFAULT;
}

static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u32 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
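	/*
	 * tidinfo is assumed to encode the RcvArray pair index in IDX and
	 * which entry of the pair in CTRL (1 or 2); CTRL values 0 and 3 do
	 * not name a single entry and are rejected below.
	 */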

	if (tidctrl == 0x3 || tidctrl == 0x0)
		return -EINVAL;

	rcventry = tididx + (tidctrl - 1);

	if (rcventry >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   rcventry, uctxt->ctxt);
		return -EINVAL;
	}

	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}

static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	mutex_lock(&node->invalidate_mutex);
	if (node->freed)
		goto done;
	node->freed = true;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/* Make sure device has seen the write before pages are unpinned */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
done:
	mutex_unlock(&node->invalidate_mutex);
}

static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	__clear_tid_node(fd, node);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}

/*
 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 * clearing nodes in the non-cached case.
 */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
							  uctxt->expected_base];
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}

static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

	if (node->freed)
		return true;

	/* take action only if unmapping */
	if (range->event != MMU_NOTIFY_UNMAP)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);

	/* clear the hardware rcvarray entry */
	__clear_tid_node(fdata, node);

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			create_tid(node->rcventry - uctxt->expected_base,
				   node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}

static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq)
{
	struct tid_user_buf *tidbuf =
		container_of(mni, struct tid_user_buf, notifier);

	/* take action only if unmapping */
	if (range->event == MMU_NOTIFY_UNMAP) {
		mutex_lock(&tidbuf->cover_mutex);
		mmu_interval_set_seq(mni, cur_seq);
		mutex_unlock(&tidbuf->cover_mutex);
	}

	return true;
}

static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode)
{
	u32 base = fdata->uctxt->expected_base;

	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
	clear_tid_node(fdata, tnode);
}