/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: fmr_pool.c,v 1.1.1.1 2007/10/11 23:31:50 Exp $
 */

#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <linux/kthread.h>

#include <rdma/ib_fmr_pool.h>

#include "core_priv.h"

#define PFX "fmr_pool: "

enum {
	IB_FMR_MAX_REMAPS = 32,

	IB_FMR_HASH_BITS  = 8,
	IB_FMR_HASH_SIZE  = 1 << IB_FMR_HASH_BITS,
	IB_FMR_HASH_MASK  = IB_FMR_HASH_SIZE - 1
};

/*
 * If an FMR is not in use, then the list member will point to either
 * its pool's free_list (if the FMR can be mapped again; that is,
 * remap_count < pool->max_remaps) or its pool's dirty_list (if the
 * FMR needs to be unmapped before being remapped).  In either of
 * these cases it is a bug if the ref_count is not 0.  In other words,
 * if ref_count is > 0, then the list member must not be linked into
 * either free_list or dirty_list.
 *
 * The cache_node member is used to link the FMR into a cache bucket
 * (if caching is enabled).  This is independent of the reference
 * count of the FMR.  When a valid FMR is released, its ref_count is
 * decremented, and if ref_count reaches 0, the FMR is placed in
 * either free_list or dirty_list as appropriate.  However, it is not
 * removed from the cache and may be "revived" if a call to
 * ib_fmr_pool_map_phys() occurs before the FMR is remapped.  In
 * this case we just increment the ref_count and remove the FMR from
 * free_list/dirty_list.
 *
 * Before we remap an FMR from free_list, we remove it from the cache
 * (to prevent another user from obtaining a stale FMR).  When an FMR
 * is released, we add it to the tail of the free list, so that our
 * cache eviction policy is "least recently used."
 *
 * All manipulation of ref_count, list and cache_node is protected by
 * pool_lock to maintain consistency.
 */

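/*
 * Illustrative caller-side sequence (a sketch only, not part of the
 * pool implementation; "pool", "pages", "npages" and "iova" are
 * assumptions made for the example, and caching is assumed enabled).
 * It shows the "revival" described above: a second map of the same
 * page list, while the FMR sits unused on free_list, returns the
 * cached FMR without consuming another remap:
 *
 *	struct ib_pool_fmr *a, *b;
 *
 *	a = ib_fmr_pool_map_phys(pool, pages, npages, iova);
 *	ib_fmr_pool_unmap(a);	(ref_count drops to 0, FMR stays cached)
 *	b = ib_fmr_pool_map_phys(pool, pages, npages, iova);
 *	(b == a: cache hit, ref_count is bumped, remap_count unchanged)
 *	ib_fmr_pool_unmap(b);
 */
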
struct ib_fmr_pool {
	spinlock_t                pool_lock;

	int                       pool_size;
	int                       max_pages;
	int                       max_remaps;
	int                       dirty_watermark;
	int                       dirty_len;
	struct list_head          free_list;
	struct list_head          dirty_list;
	struct hlist_head        *cache_bucket;

	void                     (*flush_function)(struct ib_fmr_pool *pool,
						   void *              arg);
	void                     *flush_arg;

	struct task_struct       *thread;

	atomic_t                  req_ser;
	atomic_t                  flush_ser;

	wait_queue_head_t         force_wait;
};

static inline u32 ib_fmr_hash(u64 first_page)
{
	return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) &
		IB_FMR_HASH_MASK;
}

/* Caller must hold pool_lock */
static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
						      u64 *page_list,
						      int  page_list_len,
						      u64  io_virtual_address)
{
	struct hlist_head *bucket;
	struct ib_pool_fmr *fmr;
	struct hlist_node *pos;

	if (!pool->cache_bucket)
		return NULL;

	bucket = pool->cache_bucket + ib_fmr_hash(*page_list);

	hlist_for_each_entry(fmr, pos, bucket, cache_node)
		if (io_virtual_address == fmr->io_virtual_address &&
		    page_list_len      == fmr->page_list_len      &&
		    !memcmp(page_list, fmr->page_list,
			    page_list_len * sizeof *page_list))
			return fmr;

	return NULL;
}

static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
{
	int                 ret;
	struct ib_pool_fmr *fmr;
	LIST_HEAD(unmap_list);
	LIST_HEAD(fmr_list);

	spin_lock_irq(&pool->pool_lock);

	list_for_each_entry(fmr, &pool->dirty_list, list) {
		hlist_del_init(&fmr->cache_node);
		fmr->remap_count = 0;
		list_add_tail(&fmr->fmr->list, &fmr_list);

#ifdef DEBUG
		if (fmr->ref_count != 0) {
			printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n",
			       fmr, fmr->ref_count);
		}
#endif
	}

	list_splice(&pool->dirty_list, &unmap_list);
	INIT_LIST_HEAD(&pool->dirty_list);
	pool->dirty_len = 0;

	spin_unlock_irq(&pool->pool_lock);

	if (list_empty(&unmap_list))
		return;

	ret = ib_unmap_fmr(&fmr_list);
	if (ret)
		printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret);

	spin_lock_irq(&pool->pool_lock);
	list_splice(&unmap_list, &pool->free_list);
	spin_unlock_irq(&pool->pool_lock);
}

static int ib_fmr_cleanup_thread(void *pool_ptr)
{
	struct ib_fmr_pool *pool = pool_ptr;

	do {
		if (pool->dirty_len >= pool->dirty_watermark ||
		    atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
			ib_fmr_batch_release(pool);

			atomic_inc(&pool->flush_ser);
			wake_up_interruptible(&pool->force_wait);

			if (pool->flush_function)
				pool->flush_function(pool, pool->flush_arg);
		}

		set_current_state(TASK_INTERRUPTIBLE);
		if (pool->dirty_len < pool->dirty_watermark &&
		    atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
		    !kthread_should_stop())
			schedule();
		__set_current_state(TASK_RUNNING);
	} while (!kthread_should_stop());

	return 0;
}
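
/*
 * Note on the req_ser/flush_ser test above: comparing via a signed
 * difference ("flush_ser - req_ser < 0") rather than "flush_ser <
 * req_ser" keeps the check correct when the counters wrap.  As a
 * worked example (values chosen only to illustrate): if req_ser has
 * just wrapped to INT_MIN while flush_ser is still at INT_MAX, the
 * difference is -1, so the pending flush is still detected, whereas
 * a plain "<" comparison would miss it.
 */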

/**
 * ib_create_fmr_pool - Create an FMR pool
 * @pd:Protection domain for FMRs
 * @params:FMR pool parameters
 *
 * Create a pool of FMRs.  Return value is pointer to new pool or
 * error code if creation failed.
 */
struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd             *pd,
				       struct ib_fmr_pool_param *params)
{
	struct ib_device   *device;
	struct ib_fmr_pool *pool;
	struct ib_device_attr *attr;
	int i;
	int ret;
	int max_remaps;

	if (!params)
		return ERR_PTR(-EINVAL);

	device = pd->device;
	if (!device->alloc_fmr    || !device->dealloc_fmr  ||
	    !device->map_phys_fmr || !device->unmap_fmr) {
		printk(KERN_INFO PFX "Device %s does not support FMRs\n",
		       device->name);
		return ERR_PTR(-ENOSYS);
	}

	attr = kmalloc(sizeof *attr, GFP_KERNEL);
	if (!attr) {
		printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
		return ERR_PTR(-ENOMEM);
	}

	ret = ib_query_device(device, attr);
	if (ret) {
		printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
		kfree(attr);
		return ERR_PTR(ret);
	}

	if (!attr->max_map_per_fmr)
		max_remaps = IB_FMR_MAX_REMAPS;
	else
		max_remaps = attr->max_map_per_fmr;

	kfree(attr);

	pool = kmalloc(sizeof *pool, GFP_KERNEL);
	if (!pool) {
		printk(KERN_WARNING PFX "couldn't allocate pool struct\n");
		return ERR_PTR(-ENOMEM);
	}

	pool->cache_bucket   = NULL;

	pool->flush_function = params->flush_function;
	pool->flush_arg      = params->flush_arg;

	INIT_LIST_HEAD(&pool->free_list);
	INIT_LIST_HEAD(&pool->dirty_list);

	if (params->cache) {
		pool->cache_bucket =
			kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
				GFP_KERNEL);
		if (!pool->cache_bucket) {
			printk(KERN_WARNING PFX "Failed to allocate cache in pool\n");
			ret = -ENOMEM;
			goto out_free_pool;
		}

		for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
			INIT_HLIST_HEAD(pool->cache_bucket + i);
	}

	pool->pool_size       = 0;
	pool->max_pages       = params->max_pages_per_fmr;
	pool->max_remaps      = max_remaps;
	pool->dirty_watermark = params->dirty_watermark;
	pool->dirty_len       = 0;
	spin_lock_init(&pool->pool_lock);
	atomic_set(&pool->req_ser,   0);
	atomic_set(&pool->flush_ser, 0);
	init_waitqueue_head(&pool->force_wait);

	pool->thread = kthread_create(ib_fmr_cleanup_thread,
				      pool,
				      "ib_fmr(%s)",
				      device->name);
	if (IS_ERR(pool->thread)) {
		printk(KERN_WARNING PFX "couldn't start cleanup thread\n");
		ret = PTR_ERR(pool->thread);
		goto out_free_pool;
	}

	{
		struct ib_pool_fmr *fmr;
		struct ib_fmr_attr fmr_attr = {
			.max_pages  = params->max_pages_per_fmr,
			.max_maps   = pool->max_remaps,
			.page_shift = params->page_shift
		};

		for (i = 0; i < params->pool_size; ++i) {
			fmr = kmalloc(sizeof *fmr + params->max_pages_per_fmr * sizeof (u64),
				      GFP_KERNEL);
			if (!fmr) {
				printk(KERN_WARNING PFX "failed to allocate fmr "
				       "struct for FMR %d\n", i);
				goto out_fail;
			}

			fmr->pool             = pool;
			fmr->remap_count      = 0;
			fmr->ref_count        = 0;
			INIT_HLIST_NODE(&fmr->cache_node);

			fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
			if (IS_ERR(fmr->fmr)) {
				printk(KERN_WARNING PFX "fmr_create failed "
				       "for FMR %d\n", i);
				kfree(fmr);
				goto out_fail;
			}

			list_add_tail(&fmr->list, &pool->free_list);
			++pool->pool_size;
		}
	}

	return pool;

 out_free_pool:
	kfree(pool->cache_bucket);
	kfree(pool);

	return ERR_PTR(ret);

 out_fail:
	ib_destroy_fmr_pool(pool);

	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(ib_create_fmr_pool);

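/*
 * Usage sketch (illustrative only; "pd" and the parameter values
 * below are assumptions made for the example, not requirements of
 * this API):
 *
 *	struct ib_fmr_pool_param params = {
 *		.max_pages_per_fmr = 64,
 *		.page_shift        = PAGE_SHIFT,
 *		.access            = IB_ACCESS_LOCAL_WRITE |
 *				     IB_ACCESS_REMOTE_READ,
 *		.pool_size         = 32,
 *		.dirty_watermark   = 8,
 *		.cache             = 1,
 *	};
 *	struct ib_fmr_pool *pool;
 *
 *	pool = ib_create_fmr_pool(pd, &params);
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 *	...
 *	ib_destroy_fmr_pool(pool);
 */
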
/**
 * ib_destroy_fmr_pool - Free FMR pool
 * @pool:FMR pool to free
 *
 * Destroy an FMR pool and free all associated resources.
 */
void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
{
	struct ib_pool_fmr *fmr;
	struct ib_pool_fmr *tmp;
	LIST_HEAD(fmr_list);
	int                 i;

	kthread_stop(pool->thread);
	ib_fmr_batch_release(pool);

	i = 0;
	list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) {
		if (fmr->remap_count) {
			INIT_LIST_HEAD(&fmr_list);
			list_add_tail(&fmr->fmr->list, &fmr_list);
			ib_unmap_fmr(&fmr_list);
		}
		ib_dealloc_fmr(fmr->fmr);
		list_del(&fmr->list);
		kfree(fmr);
		++i;
	}

	if (i < pool->pool_size)
		printk(KERN_WARNING PFX "pool still has %d regions registered\n",
		       pool->pool_size - i);

	kfree(pool->cache_bucket);
	kfree(pool);
}
EXPORT_SYMBOL(ib_destroy_fmr_pool);

/**
 * ib_flush_fmr_pool - Invalidate all unmapped FMRs
 * @pool:FMR pool to flush
 *
 * Ensure that all unmapped FMRs are fully invalidated.
 */
int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
{
	int serial = atomic_inc_return(&pool->req_ser);

	wake_up_process(pool->thread);

	if (wait_event_interruptible(pool->force_wait,
				     atomic_read(&pool->flush_ser) - serial >= 0))
		return -EINTR;

	return 0;
}
EXPORT_SYMBOL(ib_flush_fmr_pool);

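/*
 * Usage sketch (illustrative only): a consumer that must guarantee no
 * stale mappings remain, for example before handing memory back to an
 * unrelated user, can force a flush and handle signal interruption:
 *
 *	ret = ib_flush_fmr_pool(pool);
 *	if (ret)
 *		return ret;	(-EINTR: interrupted by a signal)
 *
 * If a flush_function callback was supplied at pool creation time, it
 * is invoked from the cleanup thread after each batch of dirty FMRs
 * has been unmapped.
 */
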
/**
 * ib_fmr_pool_map_phys - Map an FMR from an FMR pool
 * @pool_handle:FMR pool to allocate FMR from
 * @page_list:List of pages to map
 * @list_len:Number of pages in @page_list
 * @io_virtual_address:I/O virtual address for new FMR
 *
 * Map an FMR from an FMR pool.
 */
struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
					 u64                *page_list,
					 int                 list_len,
					 u64                 io_virtual_address)
{
	struct ib_fmr_pool *pool = pool_handle;
	struct ib_pool_fmr *fmr;
	unsigned long       flags;
	int                 result;

	if (list_len < 1 || list_len > pool->max_pages)
		return ERR_PTR(-EINVAL);

	spin_lock_irqsave(&pool->pool_lock, flags);
	fmr = ib_fmr_cache_lookup(pool,
				  page_list,
				  list_len,
				  io_virtual_address);
	if (fmr) {
		/* found in cache */
		++fmr->ref_count;
		if (fmr->ref_count == 1) {
			list_del(&fmr->list);
		}

		spin_unlock_irqrestore(&pool->pool_lock, flags);

		return fmr;
	}

	if (list_empty(&pool->free_list)) {
		spin_unlock_irqrestore(&pool->pool_lock, flags);
		return ERR_PTR(-EAGAIN);
	}

	fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
	list_del(&fmr->list);
	hlist_del_init(&fmr->cache_node);
	spin_unlock_irqrestore(&pool->pool_lock, flags);

	result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
				 io_virtual_address);

	if (result) {
		spin_lock_irqsave(&pool->pool_lock, flags);
		list_add(&fmr->list, &pool->free_list);
		spin_unlock_irqrestore(&pool->pool_lock, flags);

		printk(KERN_WARNING PFX "fmr_map returns %d\n", result);

		return ERR_PTR(result);
	}

	++fmr->remap_count;
	fmr->ref_count = 1;

	if (pool->cache_bucket) {
		fmr->io_virtual_address = io_virtual_address;
		fmr->page_list_len      = list_len;
		memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));

		spin_lock_irqsave(&pool->pool_lock, flags);
		hlist_add_head(&fmr->cache_node,
			       pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
		spin_unlock_irqrestore(&pool->pool_lock, flags);
	}

	return fmr;
}
EXPORT_SYMBOL(ib_fmr_pool_map_phys);

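/*
 * Usage sketch (illustrative only; "pool", "dma_pages", "npages" and
 * "iova" are assumptions made for the example).  -EAGAIN means the
 * pool is temporarily exhausted, so callers typically retry or fall
 * back to another mapping strategy:
 *
 *	struct ib_pool_fmr *pfmr;
 *
 *	pfmr = ib_fmr_pool_map_phys(pool, dma_pages, npages, iova);
 *	if (IS_ERR(pfmr))
 *		return PTR_ERR(pfmr);	(-EAGAIN: no free FMR right now)
 *
 *	(... post work requests using pfmr->fmr->lkey / pfmr->fmr->rkey ...)
 *
 *	ib_fmr_pool_unmap(pfmr);
 */
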
/**
 * ib_fmr_pool_unmap - Unmap FMR
 * @fmr:FMR to unmap
 *
 * Unmap an FMR.  The FMR mapping may remain valid until the FMR is
 * reused (or until ib_flush_fmr_pool() is called).
 */
int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
{
	struct ib_fmr_pool *pool;
	unsigned long flags;

	pool = fmr->pool;

	spin_lock_irqsave(&pool->pool_lock, flags);

	--fmr->ref_count;
	if (!fmr->ref_count) {
		if (fmr->remap_count < pool->max_remaps) {
			list_add_tail(&fmr->list, &pool->free_list);
		} else {
			list_add_tail(&fmr->list, &pool->dirty_list);
			++pool->dirty_len;
			wake_up_process(pool->thread);
		}
	}

#ifdef DEBUG
	if (fmr->ref_count < 0)
		printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n",
		       fmr, fmr->ref_count);
#endif

	spin_unlock_irqrestore(&pool->pool_lock, flags);

	return 0;
}
EXPORT_SYMBOL(ib_fmr_pool_unmap);
