1/******************************************************************************
2 *
3 * Back-end of the driver for virtual block devices. This portion of the
4 * driver exports a 'unified' block-device interface that can be accessed
5 * by any operating system that implements a compatible front end. A
6 * reference front-end implementation can be found in:
7 *  drivers/block/xen-blkfront.c
8 *
9 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
10 * Copyright (c) 2005, Christopher Clark
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License version 2
14 * as published by the Free Software Foundation; or, when distributed
15 * separately from the Linux kernel or incorporated into other
16 * software packages, subject to the following license:
17 *
18 * Permission is hereby granted, free of charge, to any person obtaining a copy
19 * of this source file (the "Software"), to deal in the Software without
20 * restriction, including without limitation the rights to use, copy, modify,
21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
22 * and to permit persons to whom the Software is furnished to do so, subject to
23 * the following conditions:
24 *
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
27 *
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
34 * IN THE SOFTWARE.
35 */
36
37#define pr_fmt(fmt) "xen-blkback: " fmt
38
39#include <linux/spinlock.h>
40#include <linux/kthread.h>
41#include <linux/list.h>
42#include <linux/delay.h>
43#include <linux/freezer.h>
44#include <linux/bitmap.h>
45
46#include <xen/events.h>
47#include <xen/page.h>
48#include <xen/xen.h>
49#include <asm/xen/hypervisor.h>
50#include <asm/xen/hypercall.h>
51#include <xen/balloon.h>
52#include <xen/grant_table.h>
53#include "common.h"
54
/*
 * Maximum number of unused free pages to keep in the internal buffer.
 * Setting this too low reduces the memory used by each backend, but can
 * incur a performance penalty.
 *
 * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but it
 * can be set to a lower value, which might degrade performance on some
 * intensive IO workloads.
 */
64
65static int max_buffer_pages = 1024;
66module_param_named(max_buffer_pages, max_buffer_pages, int, 0644);
67MODULE_PARM_DESC(max_buffer_pages,
68"Maximum number of free pages to keep in each block backend buffer");
69
/*
 * Maximum number of grants to map persistently in blkback. For maximum
 * performance this should be the total number of grants that can be used
 * to fill the ring, but since this might become too high, especially with
 * the use of indirect descriptors, we set it to a value that provides good
 * performance without using too much memory.
 *
 * When the list of persistent grants is full we clean it up using an LRU
 * algorithm.
 */
80
81static int max_pgrants = 1056;
82module_param_named(max_persistent_grants, max_pgrants, int, 0644);
83MODULE_PARM_DESC(max_persistent_grants,
84                 "Maximum number of grants to map persistently");
85
/*
 * How long a persistent grant is allowed to remain allocated without being in
 * use. The time is in seconds; 0 means indefinitely long.
 */
90
91static unsigned int pgrant_timeout = 60;
92module_param_named(persistent_grant_unused_seconds, pgrant_timeout,
93		   uint, 0644);
94MODULE_PARM_DESC(persistent_grant_unused_seconds,
95		 "Time in seconds an unused persistent grant is allowed to "
96		 "remain allocated. Default is 60, 0 means unlimited.");
97
/*
 * Maximum number of rings/queues blkback supports. If the user has not
 * specified a value, allow as many queues as there are online CPUs.
 */
102unsigned int xenblk_max_queues;
103module_param_named(max_queues, xenblk_max_queues, uint, 0644);
MODULE_PARM_DESC(max_queues,
		 "Maximum number of hardware queues per virtual disk. "
		 "By default it is the number of online CPUs.");
107
/*
 * Maximum order of pages to be used for the shared ring between the frontend
 * and the backend; 4KB page granularity is used.
 */
112unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
113module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
114MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
115/*
116 * The LRU mechanism to clean the lists of persistent grants needs to
117 * be executed periodically. The time interval between consecutive executions
118 * of the purge mechanism is set in ms.
119 */
120#define LRU_INTERVAL 100
121
122/*
123 * When the persistent grants list is full we will remove unused grants
124 * from the list. The percent number of grants to be removed at each LRU
125 * execution.
126 */
127#define LRU_PERCENT_CLEAN 5
128
129/* Run-time switchable: /sys/module/blkback/parameters/ */
130static unsigned int log_stats;
131module_param(log_stats, int, 0644);
132
133#define BLKBACK_INVALID_HANDLE (~0)
134
135static inline bool persistent_gnt_timeout(struct persistent_gnt *persistent_gnt)
136{
137	return pgrant_timeout && (jiffies - persistent_gnt->last_used >=
138			HZ * pgrant_timeout);
139}
140
141#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
142
143static int do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags);
144static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
145				struct blkif_request *req,
146				struct pending_req *pending_req);
147static void make_response(struct xen_blkif_ring *ring, u64 id,
148			  unsigned short op, int st);
149
150#define foreach_grant_safe(pos, n, rbtree, node) \
151	for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
152	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
153	     &(pos)->node != NULL; \
154	     (pos) = container_of(n, typeof(*(pos)), node), \
155	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
156
157
/*
 * We don't need locking around the persistent grant helpers
 * because blkback uses a single thread for each backend, so we
 * can be sure that these functions will never be called recursively.
 *
 * The only exception to that is put_persistent_gnt, which can be called
 * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
 * bit operations to modify the flags of a persistent grant and to count
 * the number of used grants.
 */
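/*
 * Insert @persistent_gnt into the ring's red-black tree, keyed by grant
 * reference. Fails with -EBUSY once max_pgrants mappings are in use and
 * with -EINVAL if the grant reference is already present.
 */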
168static int add_persistent_gnt(struct xen_blkif_ring *ring,
169			       struct persistent_gnt *persistent_gnt)
170{
171	struct rb_node **new = NULL, *parent = NULL;
172	struct persistent_gnt *this;
173	struct xen_blkif *blkif = ring->blkif;
174
175	if (ring->persistent_gnt_c >= max_pgrants) {
176		if (!blkif->vbd.overflow_max_grants)
177			blkif->vbd.overflow_max_grants = 1;
178		return -EBUSY;
179	}
180	/* Figure out where to put new node */
181	new = &ring->persistent_gnts.rb_node;
182	while (*new) {
183		this = container_of(*new, struct persistent_gnt, node);
184
185		parent = *new;
186		if (persistent_gnt->gnt < this->gnt)
187			new = &((*new)->rb_left);
188		else if (persistent_gnt->gnt > this->gnt)
189			new = &((*new)->rb_right);
190		else {
191			pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
192			return -EINVAL;
193		}
194	}
195
196	persistent_gnt->active = true;
197	/* Add new node and rebalance tree. */
198	rb_link_node(&(persistent_gnt->node), parent, new);
199	rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
200	ring->persistent_gnt_c++;
201	atomic_inc(&ring->persistent_gnt_in_use);
202	return 0;
203}
204
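/*
 * Look up @gref in the ring's tree of persistent grants. On success the
 * grant is marked active and the in-use count is incremented; a grant that
 * is already active is reported and NULL is returned.
 */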
205static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
206						 grant_ref_t gref)
207{
208	struct persistent_gnt *data;
209	struct rb_node *node = NULL;
210
211	node = ring->persistent_gnts.rb_node;
212	while (node) {
213		data = container_of(node, struct persistent_gnt, node);
214
215		if (gref < data->gnt)
216			node = node->rb_left;
217		else if (gref > data->gnt)
218			node = node->rb_right;
219		else {
220			if (data->active) {
221				pr_alert_ratelimited("requesting a grant already in use\n");
222				return NULL;
223			}
224			data->active = true;
225			atomic_inc(&ring->persistent_gnt_in_use);
226			return data;
227		}
228	}
229	return NULL;
230}
231
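/*
 * Release a grant obtained with get_persistent_gnt(): record the time of
 * last use, clear the active flag and drop the in-use count.
 */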
232static void put_persistent_gnt(struct xen_blkif_ring *ring,
233                               struct persistent_gnt *persistent_gnt)
234{
235	if (!persistent_gnt->active)
236		pr_alert_ratelimited("freeing a grant already unused\n");
237	persistent_gnt->last_used = jiffies;
238	persistent_gnt->active = false;
239	atomic_dec(&ring->persistent_gnt_in_use);
240}
241
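/*
 * Tear down all persistent grants of a ring: walk the tree, unmap the grants
 * in batches of BLKIF_MAX_SEGMENTS_PER_REQUEST and return the backing pages
 * to the ring's free page cache.
 */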
242static void free_persistent_gnts(struct xen_blkif_ring *ring)
243{
244	struct rb_root *root = &ring->persistent_gnts;
245	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
246	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
247	struct persistent_gnt *persistent_gnt;
248	struct rb_node *n;
249	int segs_to_unmap = 0;
250	struct gntab_unmap_queue_data unmap_data;
251
252	if (RB_EMPTY_ROOT(root))
253		return;
254
255	unmap_data.pages = pages;
256	unmap_data.unmap_ops = unmap;
257	unmap_data.kunmap_ops = NULL;
258
259	foreach_grant_safe(persistent_gnt, n, root, node) {
260		BUG_ON(persistent_gnt->handle ==
261			BLKBACK_INVALID_HANDLE);
262		gnttab_set_unmap_op(&unmap[segs_to_unmap],
263			(unsigned long) pfn_to_kaddr(page_to_pfn(
264				persistent_gnt->page)),
265			GNTMAP_host_map,
266			persistent_gnt->handle);
267
268		pages[segs_to_unmap] = persistent_gnt->page;
269
270		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
271			!rb_next(&persistent_gnt->node)) {
272
273			unmap_data.count = segs_to_unmap;
274			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
275
276			gnttab_page_cache_put(&ring->free_pages, pages,
277					      segs_to_unmap);
278			segs_to_unmap = 0;
279		}
280
281		rb_erase(&persistent_gnt->node, root);
282		kfree(persistent_gnt);
283		ring->persistent_gnt_c--;
284	}
285
286	BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
287	BUG_ON(ring->persistent_gnt_c != 0);
288}
289
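/*
 * Deferred work item: unmap the grants that purge_persistent_gnt() moved to
 * persistent_purge_list, batching the unmap operations and recycling the
 * pages through the ring's free page cache.
 */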
290void xen_blkbk_unmap_purged_grants(struct work_struct *work)
291{
292	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
293	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
294	struct persistent_gnt *persistent_gnt;
295	int segs_to_unmap = 0;
296	struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
297	struct gntab_unmap_queue_data unmap_data;
298
299	unmap_data.pages = pages;
300	unmap_data.unmap_ops = unmap;
301	unmap_data.kunmap_ops = NULL;
302
	while (!list_empty(&ring->persistent_purge_list)) {
304		persistent_gnt = list_first_entry(&ring->persistent_purge_list,
305		                                  struct persistent_gnt,
306		                                  remove_node);
307		list_del(&persistent_gnt->remove_node);
308
309		gnttab_set_unmap_op(&unmap[segs_to_unmap],
310			vaddr(persistent_gnt->page),
311			GNTMAP_host_map,
312			persistent_gnt->handle);
313
314		pages[segs_to_unmap] = persistent_gnt->page;
315
316		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
317			unmap_data.count = segs_to_unmap;
318			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
319			gnttab_page_cache_put(&ring->free_pages, pages,
320					      segs_to_unmap);
321			segs_to_unmap = 0;
322		}
323		kfree(persistent_gnt);
324	}
325	if (segs_to_unmap > 0) {
326		unmap_data.count = segs_to_unmap;
327		BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
328		gnttab_page_cache_put(&ring->free_pages, pages, segs_to_unmap);
329	}
330}
331
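/*
 * LRU cleanup of the persistent grant tree, run periodically from
 * xen_blkif_schedule(). Idle grants are moved to persistent_purge_list in
 * two passes (grants that have timed out first, then any unused grant until
 * the requested number is reached); the actual unmapping is deferred to
 * xen_blkbk_unmap_purged_grants().
 */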
332static void purge_persistent_gnt(struct xen_blkif_ring *ring)
333{
334	struct persistent_gnt *persistent_gnt;
335	struct rb_node *n;
336	unsigned int num_clean, total;
337	bool scan_used = false;
338	struct rb_root *root;
339
340	if (work_busy(&ring->persistent_purge_work)) {
341		pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
342		goto out;
343	}
344
345	if (ring->persistent_gnt_c < max_pgrants ||
346	    (ring->persistent_gnt_c == max_pgrants &&
347	    !ring->blkif->vbd.overflow_max_grants)) {
348		num_clean = 0;
349	} else {
350		num_clean = (max_pgrants / 100) * LRU_PERCENT_CLEAN;
351		num_clean = ring->persistent_gnt_c - max_pgrants + num_clean;
352		num_clean = min(ring->persistent_gnt_c, num_clean);
353		pr_debug("Going to purge at least %u persistent grants\n",
354			 num_clean);
355	}
356
	/*
	 * At this point, we can be sure that there will be no calls
	 * to get_persistent_gnt (because we are executing this code from
	 * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
	 * which means that the number of currently used grants will go down,
	 * but never up, so we will always be able to remove the requested
	 * number of grants.
	 */
365
366	total = 0;
367
368	BUG_ON(!list_empty(&ring->persistent_purge_list));
369	root = &ring->persistent_gnts;
370purge_list:
371	foreach_grant_safe(persistent_gnt, n, root, node) {
372		BUG_ON(persistent_gnt->handle ==
373			BLKBACK_INVALID_HANDLE);
374
375		if (persistent_gnt->active)
376			continue;
377		if (!scan_used && !persistent_gnt_timeout(persistent_gnt))
378			continue;
379		if (scan_used && total >= num_clean)
380			continue;
381
382		rb_erase(&persistent_gnt->node, root);
383		list_add(&persistent_gnt->remove_node,
384			 &ring->persistent_purge_list);
385		total++;
386	}
	/*
	 * Check whether we also need to start cleaning
	 * grants that were used since the last purge in order to cope
	 * with the requested number
	 */
392	if (!scan_used && total < num_clean) {
393		pr_debug("Still missing %u purged frames\n", num_clean - total);
394		scan_used = true;
395		goto purge_list;
396	}
397
398	if (total) {
399		ring->persistent_gnt_c -= total;
400		ring->blkif->vbd.overflow_max_grants = 0;
401
402		/* We can defer this work */
403		schedule_work(&ring->persistent_purge_work);
404		pr_debug("Purged %u/%u\n", num_clean, total);
405	}
406
407out:
408	return;
409}
410
/*
 * Retrieve a free pending_req structure from 'pending_free' to be used.
 */
414static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
415{
416	struct pending_req *req = NULL;
417	unsigned long flags;
418
419	spin_lock_irqsave(&ring->pending_free_lock, flags);
420	if (!list_empty(&ring->pending_free)) {
421		req = list_entry(ring->pending_free.next, struct pending_req,
422				 free_list);
423		list_del(&req->free_list);
424	}
425	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
426	return req;
427}
428
/*
 * Return the 'pending_req' structure back to the free pool. We also
 * wake up the thread if it was waiting for a free request.
 */
433static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
434{
435	unsigned long flags;
436	int was_empty;
437
438	spin_lock_irqsave(&ring->pending_free_lock, flags);
439	was_empty = list_empty(&ring->pending_free);
440	list_add(&req->free_list, &ring->pending_free);
441	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
442	if (was_empty)
443		wake_up(&ring->pending_free_wq);
444}
445
446/*
447 * Routines for managing virtual block devices (vbds).
448 */
449static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
450			     enum req_op operation)
451{
452	struct xen_vbd *vbd = &blkif->vbd;
453	int rc = -EACCES;
454
455	if ((operation != REQ_OP_READ) && vbd->readonly)
456		goto out;
457
458	if (likely(req->nr_sects)) {
459		blkif_sector_t end = req->sector_number + req->nr_sects;
460
461		if (unlikely(end < req->sector_number))
462			goto out;
463		if (unlikely(end > vbd_sz(vbd)))
464			goto out;
465	}
466
467	req->dev  = vbd->pdevice;
468	req->bdev = file_bdev(vbd->bdev_file);
469	rc = 0;
470
471 out:
472	return rc;
473}
474
475static void xen_vbd_resize(struct xen_blkif *blkif)
476{
477	struct xen_vbd *vbd = &blkif->vbd;
478	struct xenbus_transaction xbt;
479	int err;
480	struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
481	unsigned long long new_size = vbd_sz(vbd);
482
483	pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
484		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
485	pr_info("VBD Resize: new size %llu\n", new_size);
486	vbd->size = new_size;
487again:
488	err = xenbus_transaction_start(&xbt);
489	if (err) {
490		pr_warn("Error starting transaction\n");
491		return;
492	}
493	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
494			    (unsigned long long)vbd_sz(vbd));
495	if (err) {
496		pr_warn("Error writing new size\n");
497		goto abort;
498	}
499	/*
500	 * Write the current state; we will use this to synchronize
501	 * the front-end. If the current state is "connected" the
502	 * front-end will get the new size information online.
503	 */
504	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
505	if (err) {
506		pr_warn("Error writing the state\n");
507		goto abort;
508	}
509
510	err = xenbus_transaction_end(xbt, 0);
511	if (err == -EAGAIN)
512		goto again;
513	if (err)
514		pr_warn("Error ending transaction\n");
515	return;
516abort:
517	xenbus_transaction_end(xbt, 1);
518}
519
520/*
521 * Notification from the guest OS.
522 */
523static void blkif_notify_work(struct xen_blkif_ring *ring)
524{
525	ring->waiting_reqs = 1;
526	wake_up(&ring->wq);
527}
528
529irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
530{
531	blkif_notify_work(dev_id);
532	return IRQ_HANDLED;
533}
534
535/*
536 * SCHEDULER FUNCTIONS
537 */
538
539static void print_stats(struct xen_blkif_ring *ring)
540{
541	pr_info("(%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
542		 "  |  ds %4llu | pg: %4u/%4d\n",
543		 current->comm, ring->st_oo_req,
544		 ring->st_rd_req, ring->st_wr_req,
545		 ring->st_f_req, ring->st_ds_req,
546		 ring->persistent_gnt_c, max_pgrants);
547	ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
548	ring->st_rd_req = 0;
549	ring->st_wr_req = 0;
550	ring->st_oo_req = 0;
551	ring->st_ds_req = 0;
552}
553
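/*
 * Main loop of the per-ring kernel thread: wait for requests from the
 * frontend, process them via do_block_io_op(), periodically purge idle
 * persistent grants, shrink the free page cache, and signal the event
 * channel EOI once all queued requests have been picked up.
 */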
554int xen_blkif_schedule(void *arg)
555{
556	struct xen_blkif_ring *ring = arg;
557	struct xen_blkif *blkif = ring->blkif;
558	struct xen_vbd *vbd = &blkif->vbd;
559	unsigned long timeout;
560	int ret;
561	bool do_eoi;
562	unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
563
564	set_freezable();
565	while (!kthread_should_stop()) {
566		if (try_to_freeze())
567			continue;
568		if (unlikely(vbd->size != vbd_sz(vbd)))
569			xen_vbd_resize(blkif);
570
571		timeout = msecs_to_jiffies(LRU_INTERVAL);
572
573		timeout = wait_event_interruptible_timeout(
574			ring->wq,
575			ring->waiting_reqs || kthread_should_stop(),
576			timeout);
577		if (timeout == 0)
578			goto purge_gnt_list;
579		timeout = wait_event_interruptible_timeout(
580			ring->pending_free_wq,
581			!list_empty(&ring->pending_free) ||
582			kthread_should_stop(),
583			timeout);
584		if (timeout == 0)
585			goto purge_gnt_list;
586
587		do_eoi = ring->waiting_reqs;
588
589		ring->waiting_reqs = 0;
590		smp_mb(); /* clear flag *before* checking for work */
591
592		ret = do_block_io_op(ring, &eoi_flags);
593		if (ret > 0)
594			ring->waiting_reqs = 1;
595		if (ret == -EACCES)
596			wait_event_interruptible(ring->shutdown_wq,
597						 kthread_should_stop());
598
599		if (do_eoi && !ring->waiting_reqs) {
600			xen_irq_lateeoi(ring->irq, eoi_flags);
601			eoi_flags |= XEN_EOI_FLAG_SPURIOUS;
602		}
603
604purge_gnt_list:
605		if (blkif->vbd.feature_gnt_persistent &&
606		    time_after(jiffies, ring->next_lru)) {
607			purge_persistent_gnt(ring);
608			ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
609		}
610
611		/* Shrink the free pages pool if it is too large. */
612		if (time_before(jiffies, blkif->buffer_squeeze_end))
613			gnttab_page_cache_shrink(&ring->free_pages, 0);
614		else
615			gnttab_page_cache_shrink(&ring->free_pages,
616						 max_buffer_pages);
617
618		if (log_stats && time_after(jiffies, ring->st_print))
619			print_stats(ring);
620	}
621
622	/* Drain pending purge work */
623	flush_work(&ring->persistent_purge_work);
624
625	if (log_stats)
626		print_stats(ring);
627
628	ring->xenblkd = NULL;
629
630	return 0;
631}
632
633/*
634 * Remove persistent grants and empty the pool of free pages
635 */
636void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
637{
638	/* Free all persistent grant pages */
639	free_persistent_gnts(ring);
640
641	/* Since we are shutting down remove all pages from the buffer */
642	gnttab_page_cache_shrink(&ring->free_pages, 0 /* All */);
643}
644
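/*
 * Build the unmap operations for a batch of grant pages: persistent grants
 * are simply released with put_persistent_gnt(), while non-persistent ones
 * get a gnttab_set_unmap_op() entry and have their handle invalidated.
 * Returns the number of unmap operations prepared.
 */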
645static unsigned int xen_blkbk_unmap_prepare(
646	struct xen_blkif_ring *ring,
647	struct grant_page **pages,
648	unsigned int num,
649	struct gnttab_unmap_grant_ref *unmap_ops,
650	struct page **unmap_pages)
651{
652	unsigned int i, invcount = 0;
653
654	for (i = 0; i < num; i++) {
655		if (pages[i]->persistent_gnt != NULL) {
656			put_persistent_gnt(ring, pages[i]->persistent_gnt);
657			continue;
658		}
659		if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
660			continue;
661		unmap_pages[invcount] = pages[i]->page;
662		gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
663				    GNTMAP_host_map, pages[i]->handle);
664		pages[i]->handle = BLKBACK_INVALID_HANDLE;
665		invcount++;
666	}
667
668	return invcount;
669}
670
671static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
672{
673	struct pending_req *pending_req = (struct pending_req *)(data->data);
674	struct xen_blkif_ring *ring = pending_req->ring;
675	struct xen_blkif *blkif = ring->blkif;
676
	/*
	 * BUG_ON used to reproduce existing behaviour, but is this the best
	 * way to deal with this?
	 */
679	BUG_ON(result);
680
681	gnttab_page_cache_put(&ring->free_pages, data->pages, data->count);
682	make_response(ring, pending_req->id,
683		      pending_req->operation, pending_req->status);
684	free_req(ring, pending_req);
	/*
	 * Make sure the request is freed before releasing blkif,
	 * or there could be a race between free_req and the
	 * cleanup done in xen_blkif_free during shutdown.
	 *
	 * NB: The fact that we might try to wake up pending_free_wq
	 * before drain_complete (in case there's a drain going on)
	 * is not a problem with our current implementation
	 * because we can be sure there's no thread waiting on
	 * pending_free_wq if there's a drain going on, but it has
	 * to be taken into account if the current model is changed.
	 */
697	if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
698		complete(&blkif->drain_complete);
699	}
700	xen_blkif_put(blkif);
701}
702
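/*
 * Unmap all grant pages of a completed request asynchronously; the response
 * to the frontend is sent from the completion callback above.
 */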
703static void xen_blkbk_unmap_and_respond(struct pending_req *req)
704{
705	struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
706	struct xen_blkif_ring *ring = req->ring;
707	struct grant_page **pages = req->segments;
708	unsigned int invcount;
709
710	invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
711					   req->unmap, req->unmap_pages);
712
713	work->data = req;
714	work->done = xen_blkbk_unmap_and_respond_callback;
715	work->unmap_ops = req->unmap;
716	work->kunmap_ops = NULL;
717	work->pages = req->unmap_pages;
718	work->count = invcount;
719
720	gnttab_unmap_refs_async(&req->gnttab_unmap_data);
721}
722
723
724/*
725 * Unmap the grant references.
726 *
727 * This could accumulate ops up to the batch size to reduce the number
728 * of hypercalls, but since this is only used in error paths there's
729 * no real need.
730 */
731static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
732                            struct grant_page *pages[],
733                            int num)
734{
735	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
736	struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
737	unsigned int invcount = 0;
738	int ret;
739
740	while (num) {
741		unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
742
743		invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
744						   unmap, unmap_pages);
745		if (invcount) {
746			ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
747			BUG_ON(ret);
748			gnttab_page_cache_put(&ring->free_pages, unmap_pages,
749					      invcount);
750		}
751		pages += batch;
752		num -= batch;
753	}
754}
755
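/*
 * Map the grant references of @pages into this domain. Grants that are
 * already persistently mapped are reused; new mappings are issued in batches
 * of BLKIF_MAX_SEGMENTS_PER_REQUEST and, when there is room, added to the
 * tree of persistent grants.
 */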
756static int xen_blkbk_map(struct xen_blkif_ring *ring,
757			 struct grant_page *pages[],
758			 int num, bool ro)
759{
760	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
761	struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
762	struct persistent_gnt *persistent_gnt = NULL;
763	phys_addr_t addr = 0;
764	int i, seg_idx, new_map_idx;
765	int segs_to_map = 0;
766	int ret = 0;
767	int last_map = 0, map_until = 0;
768	int use_persistent_gnts;
769	struct xen_blkif *blkif = ring->blkif;
770
771	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
772
	/*
	 * Fill out preq.nr_sects with the proper number of sectors, and set up
	 * map[..] with the PFN of the page in our domain and the
	 * corresponding grant reference for each page.
	 */
778again:
779	for (i = map_until; i < num; i++) {
780		uint32_t flags;
781
782		if (use_persistent_gnts) {
783			persistent_gnt = get_persistent_gnt(
784				ring,
785				pages[i]->gref);
786		}
787
788		if (persistent_gnt) {
789			/*
790			 * We are using persistent grants and
791			 * the grant is already mapped
792			 */
793			pages[i]->page = persistent_gnt->page;
794			pages[i]->persistent_gnt = persistent_gnt;
795		} else {
796			if (gnttab_page_cache_get(&ring->free_pages,
797						  &pages[i]->page)) {
798				gnttab_page_cache_put(&ring->free_pages,
799						      pages_to_gnt,
800						      segs_to_map);
801				ret = -ENOMEM;
802				goto out;
803			}
804			addr = vaddr(pages[i]->page);
805			pages_to_gnt[segs_to_map] = pages[i]->page;
806			pages[i]->persistent_gnt = NULL;
807			flags = GNTMAP_host_map;
808			if (!use_persistent_gnts && ro)
809				flags |= GNTMAP_readonly;
810			gnttab_set_map_op(&map[segs_to_map++], addr,
811					  flags, pages[i]->gref,
812					  blkif->domid);
813		}
814		map_until = i + 1;
815		if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
816			break;
817	}
818
819	if (segs_to_map)
820		ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
821
822	/*
823	 * Now swizzle the MFN in our domain with the MFN from the other domain
824	 * so that when we access vaddr(pending_req,i) it has the contents of
825	 * the page from the other domain.
826	 */
827	for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
828		if (!pages[seg_idx]->persistent_gnt) {
829			/* This is a newly mapped grant */
830			BUG_ON(new_map_idx >= segs_to_map);
831			if (unlikely(map[new_map_idx].status != 0)) {
832				pr_debug("invalid buffer -- could not remap it\n");
833				gnttab_page_cache_put(&ring->free_pages,
834						      &pages[seg_idx]->page, 1);
835				pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
836				ret |= !ret;
837				goto next;
838			}
839			pages[seg_idx]->handle = map[new_map_idx].handle;
840		} else {
841			continue;
842		}
843		if (use_persistent_gnts &&
844		    ring->persistent_gnt_c < max_pgrants) {
845			/*
846			 * We are using persistent grants, the grant is
847			 * not mapped but we might have room for it.
848			 */
849			persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
850				                 GFP_KERNEL);
851			if (!persistent_gnt) {
				/*
				 * If we don't have enough memory to
				 * allocate the persistent_gnt struct,
				 * map this grant non-persistently.
				 */
857				goto next;
858			}
859			persistent_gnt->gnt = map[new_map_idx].ref;
860			persistent_gnt->handle = map[new_map_idx].handle;
861			persistent_gnt->page = pages[seg_idx]->page;
862			if (add_persistent_gnt(ring,
863			                       persistent_gnt)) {
864				kfree(persistent_gnt);
865				persistent_gnt = NULL;
866				goto next;
867			}
868			pages[seg_idx]->persistent_gnt = persistent_gnt;
869			pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
870				 persistent_gnt->gnt, ring->persistent_gnt_c,
871				 max_pgrants);
872			goto next;
873		}
874		if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
875			blkif->vbd.overflow_max_grants = 1;
876			pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
877			         blkif->domid, blkif->vbd.handle);
878		}
879		/*
880		 * We could not map this grant persistently, so use it as
881		 * a non-persistent grant.
882		 */
883next:
884		new_map_idx++;
885	}
886	segs_to_map = 0;
887	last_map = map_until;
888	if (!ret && map_until != num)
889		goto again;
890
891out:
892	for (i = last_map; i < num; i++) {
893		/* Don't zap current batch's valid persistent grants. */
894		if (i >= map_until)
895			pages[i]->persistent_gnt = NULL;
896		pages[i]->handle = BLKBACK_INVALID_HANDLE;
897	}
898
899	return ret;
900}
901
902static int xen_blkbk_map_seg(struct pending_req *pending_req)
903{
904	int rc;
905
906	rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
907			   pending_req->nr_segs,
908	                   (pending_req->operation != BLKIF_OP_READ));
909
910	return rc;
911}
912
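/*
 * For a BLKIF_OP_INDIRECT request: map the indirect descriptor pages, copy
 * each segment's grant reference and sector range out of the descriptors,
 * validate the ranges, and unmap the descriptor pages again.
 */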
913static int xen_blkbk_parse_indirect(struct blkif_request *req,
914				    struct pending_req *pending_req,
915				    struct seg_buf seg[],
916				    struct phys_req *preq)
917{
918	struct grant_page **pages = pending_req->indirect_pages;
919	struct xen_blkif_ring *ring = pending_req->ring;
920	int indirect_grefs, rc, n, nseg, i;
921	struct blkif_request_segment *segments = NULL;
922
923	nseg = pending_req->nr_segs;
924	indirect_grefs = INDIRECT_PAGES(nseg);
925	BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
926
927	for (i = 0; i < indirect_grefs; i++)
928		pages[i]->gref = req->u.indirect.indirect_grefs[i];
929
930	rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
931	if (rc)
932		goto unmap;
933
934	for (n = 0; n < nseg; n++) {
935		uint8_t first_sect, last_sect;
936
937		if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
938			/* Map indirect segments */
939			if (segments)
940				kunmap_atomic(segments);
941			segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
942		}
943		i = n % SEGS_PER_INDIRECT_FRAME;
944
945		pending_req->segments[n]->gref = segments[i].gref;
946
947		first_sect = READ_ONCE(segments[i].first_sect);
948		last_sect = READ_ONCE(segments[i].last_sect);
949		if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
950			rc = -EINVAL;
951			goto unmap;
952		}
953
954		seg[n].nsec = last_sect - first_sect + 1;
955		seg[n].offset = first_sect << 9;
956		preq->nr_sects += seg[n].nsec;
957	}
958
959unmap:
960	if (segments)
961		kunmap_atomic(segments);
962	xen_blkbk_unmap(ring, pages, indirect_grefs);
963	return rc;
964}
965
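/*
 * Handle a BLKIF_OP_DISCARD request: translate and check the sector range,
 * then issue a discard (or a secure erase, if supported and requested) to
 * the underlying block device and respond to the frontend directly.
 */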
966static int dispatch_discard_io(struct xen_blkif_ring *ring,
967				struct blkif_request *req)
968{
969	int err = 0;
970	int status = BLKIF_RSP_OKAY;
971	struct xen_blkif *blkif = ring->blkif;
972	struct block_device *bdev = file_bdev(blkif->vbd.bdev_file);
973	struct phys_req preq;
974
975	xen_blkif_get(blkif);
976
977	preq.sector_number = req->u.discard.sector_number;
978	preq.nr_sects      = req->u.discard.nr_sectors;
979
980	err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE);
981	if (err) {
982		pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
983			preq.sector_number,
984			preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
985		goto fail_response;
986	}
987	ring->st_ds_req++;
988
989	if (blkif->vbd.discard_secure &&
990	    (req->u.discard.flag & BLKIF_DISCARD_SECURE))
991		err = blkdev_issue_secure_erase(bdev,
992				req->u.discard.sector_number,
993				req->u.discard.nr_sectors, GFP_KERNEL);
994	else
995		err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
996				req->u.discard.nr_sectors, GFP_KERNEL);
997
998fail_response:
999	if (err == -EOPNOTSUPP) {
1000		pr_debug("discard op failed, not supported\n");
1001		status = BLKIF_RSP_EOPNOTSUPP;
1002	} else if (err)
1003		status = BLKIF_RSP_ERROR;
1004
1005	make_response(ring, req->u.discard.id, req->operation, status);
1006	xen_blkif_put(blkif);
1007	return err;
1008}
1009
1010static int dispatch_other_io(struct xen_blkif_ring *ring,
1011			     struct blkif_request *req,
1012			     struct pending_req *pending_req)
1013{
1014	free_req(ring, pending_req);
1015	make_response(ring, req->u.other.id, req->operation,
1016		      BLKIF_RSP_EOPNOTSUPP);
1017	return -EIO;
1018}
1019
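/*
 * Wait until all in-flight requests on this ring have completed; used to
 * order a WRITE_BARRIER behind the outstanding I/O.
 */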
1020static void xen_blk_drain_io(struct xen_blkif_ring *ring)
1021{
1022	struct xen_blkif *blkif = ring->blkif;
1023
1024	atomic_set(&blkif->drain, 1);
1025	do {
1026		if (atomic_read(&ring->inflight) == 0)
1027			break;
1028		wait_for_completion_interruptible_timeout(
1029				&blkif->drain_complete, HZ);
1030
1031		if (!atomic_read(&blkif->drain))
1032			break;
1033	} while (!kthread_should_stop());
1034	atomic_set(&blkif->drain, 0);
1035}
1036
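/*
 * Record the completion status of one bio of a request and, once the last
 * bio has finished, unmap the grants and send the response.
 */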
1037static void __end_block_io_op(struct pending_req *pending_req,
1038		blk_status_t error)
1039{
1040	/* An error fails the entire request. */
1041	if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE &&
1042	    error == BLK_STS_NOTSUPP) {
1043		pr_debug("flush diskcache op failed, not supported\n");
1044		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
1045		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1046	} else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER &&
1047		   error == BLK_STS_NOTSUPP) {
1048		pr_debug("write barrier op failed, not supported\n");
1049		xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
1050		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1051	} else if (error) {
1052		pr_debug("Buffer not up-to-date at end of operation,"
1053			 " error=%d\n", error);
1054		pending_req->status = BLKIF_RSP_ERROR;
1055	}
1056
1057	/*
1058	 * If all of the bio's have completed it is time to unmap
1059	 * the grant references associated with 'request' and provide
1060	 * the proper response on the ring.
1061	 */
1062	if (atomic_dec_and_test(&pending_req->pendcnt))
1063		xen_blkbk_unmap_and_respond(pending_req);
1064}
1065
1066/*
1067 * bio callback.
1068 */
1069static void end_block_io_op(struct bio *bio)
1070{
1071	__end_block_io_op(bio->bi_private, bio->bi_status);
1072	bio_put(bio);
1073}
1074
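/*
 * Copy a request from the 32-bit ABI ring layout into the native
 * 'struct blkif_request', reading the operation and segment counts only
 * once to guard against concurrent frontend updates.
 */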
1075static void blkif_get_x86_32_req(struct blkif_request *dst,
1076				 const struct blkif_x86_32_request *src)
1077{
1078	unsigned int i, n;
1079
1080	dst->operation = READ_ONCE(src->operation);
1081
1082	switch (dst->operation) {
1083	case BLKIF_OP_READ:
1084	case BLKIF_OP_WRITE:
1085	case BLKIF_OP_WRITE_BARRIER:
1086	case BLKIF_OP_FLUSH_DISKCACHE:
1087		dst->u.rw.nr_segments = READ_ONCE(src->u.rw.nr_segments);
1088		dst->u.rw.handle = src->u.rw.handle;
1089		dst->u.rw.id = src->u.rw.id;
1090		dst->u.rw.sector_number = src->u.rw.sector_number;
1091		n = min_t(unsigned int, BLKIF_MAX_SEGMENTS_PER_REQUEST,
1092			  dst->u.rw.nr_segments);
1093		for (i = 0; i < n; i++)
1094			dst->u.rw.seg[i] = src->u.rw.seg[i];
1095		break;
1096
1097	case BLKIF_OP_DISCARD:
1098		dst->u.discard.flag = src->u.discard.flag;
1099		dst->u.discard.id = src->u.discard.id;
1100		dst->u.discard.sector_number = src->u.discard.sector_number;
1101		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
1102		break;
1103
1104	case BLKIF_OP_INDIRECT:
1105		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
1106		dst->u.indirect.nr_segments =
1107			READ_ONCE(src->u.indirect.nr_segments);
1108		dst->u.indirect.handle = src->u.indirect.handle;
1109		dst->u.indirect.id = src->u.indirect.id;
1110		dst->u.indirect.sector_number = src->u.indirect.sector_number;
1111		n = min(MAX_INDIRECT_PAGES,
1112			INDIRECT_PAGES(dst->u.indirect.nr_segments));
1113		for (i = 0; i < n; i++)
1114			dst->u.indirect.indirect_grefs[i] =
1115				src->u.indirect.indirect_grefs[i];
1116		break;
1117
1118	default:
1119		/*
1120		 * Don't know how to translate this op. Only get the
1121		 * ID so failure can be reported to the frontend.
1122		 */
1123		dst->u.other.id = src->u.other.id;
1124		break;
1125	}
1126}
1127
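/* As blkif_get_x86_32_req(), but for the 64-bit ABI ring layout. */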
1128static void blkif_get_x86_64_req(struct blkif_request *dst,
1129				 const struct blkif_x86_64_request *src)
1130{
1131	unsigned int i, n;
1132
1133	dst->operation = READ_ONCE(src->operation);
1134
1135	switch (dst->operation) {
1136	case BLKIF_OP_READ:
1137	case BLKIF_OP_WRITE:
1138	case BLKIF_OP_WRITE_BARRIER:
1139	case BLKIF_OP_FLUSH_DISKCACHE:
1140		dst->u.rw.nr_segments = READ_ONCE(src->u.rw.nr_segments);
1141		dst->u.rw.handle = src->u.rw.handle;
1142		dst->u.rw.id = src->u.rw.id;
1143		dst->u.rw.sector_number = src->u.rw.sector_number;
1144		n = min_t(unsigned int, BLKIF_MAX_SEGMENTS_PER_REQUEST,
1145			  dst->u.rw.nr_segments);
1146		for (i = 0; i < n; i++)
1147			dst->u.rw.seg[i] = src->u.rw.seg[i];
1148		break;
1149
1150	case BLKIF_OP_DISCARD:
1151		dst->u.discard.flag = src->u.discard.flag;
1152		dst->u.discard.id = src->u.discard.id;
1153		dst->u.discard.sector_number = src->u.discard.sector_number;
1154		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
1155		break;
1156
1157	case BLKIF_OP_INDIRECT:
1158		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
1159		dst->u.indirect.nr_segments =
1160			READ_ONCE(src->u.indirect.nr_segments);
1161		dst->u.indirect.handle = src->u.indirect.handle;
1162		dst->u.indirect.id = src->u.indirect.id;
1163		dst->u.indirect.sector_number = src->u.indirect.sector_number;
1164		n = min(MAX_INDIRECT_PAGES,
1165			INDIRECT_PAGES(dst->u.indirect.nr_segments));
1166		for (i = 0; i < n; i++)
1167			dst->u.indirect.indirect_grefs[i] =
1168				src->u.indirect.indirect_grefs[i];
1169		break;
1170
1171	default:
1172		/*
1173		 * Don't know how to translate this op. Only get the
1174		 * ID so failure can be reported to the frontend.
1175		 */
1176		dst->u.other.id = src->u.other.id;
1177		break;
1178	}
1179}
1180
/*
 * Copy a 'struct blkif_request' from the ring buffer (which has the sectors
 * we want, the number of them, grant references, etc.) and transmute it to
 * the block API to hand it over to the proper block disk.
 */
1186static int
1187__do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
1188{
1189	union blkif_back_rings *blk_rings = &ring->blk_rings;
1190	struct blkif_request req;
1191	struct pending_req *pending_req;
1192	RING_IDX rc, rp;
1193	int more_to_do = 0;
1194
1195	rc = blk_rings->common.req_cons;
1196	rp = blk_rings->common.sring->req_prod;
1197	rmb(); /* Ensure we see queued requests up to 'rp'. */
1198
1199	if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
1200		rc = blk_rings->common.rsp_prod_pvt;
1201		pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
1202			rp, rc, rp - rc, ring->blkif->vbd.pdevice);
1203		return -EACCES;
1204	}
1205	while (rc != rp) {
1206
1207		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
1208			break;
1209
1210		/* We've seen a request, so clear spurious eoi flag. */
1211		*eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS;
1212
1213		if (kthread_should_stop()) {
1214			more_to_do = 1;
1215			break;
1216		}
1217
1218		pending_req = alloc_req(ring);
1219		if (NULL == pending_req) {
1220			ring->st_oo_req++;
1221			more_to_do = 1;
1222			break;
1223		}
1224
1225		switch (ring->blkif->blk_protocol) {
1226		case BLKIF_PROTOCOL_NATIVE:
1227			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
1228			break;
1229		case BLKIF_PROTOCOL_X86_32:
1230			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
1231			break;
1232		case BLKIF_PROTOCOL_X86_64:
1233			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
1234			break;
1235		default:
1236			BUG();
1237		}
1238		blk_rings->common.req_cons = ++rc; /* before make_response() */
1239
1240		/* Apply all sanity checks to /private copy/ of request. */
1241		barrier();
1242
1243		switch (req.operation) {
1244		case BLKIF_OP_READ:
1245		case BLKIF_OP_WRITE:
1246		case BLKIF_OP_WRITE_BARRIER:
1247		case BLKIF_OP_FLUSH_DISKCACHE:
1248		case BLKIF_OP_INDIRECT:
1249			if (dispatch_rw_block_io(ring, &req, pending_req))
1250				goto done;
1251			break;
1252		case BLKIF_OP_DISCARD:
1253			free_req(ring, pending_req);
1254			if (dispatch_discard_io(ring, &req))
1255				goto done;
1256			break;
1257		default:
1258			if (dispatch_other_io(ring, &req, pending_req))
1259				goto done;
1260			break;
1261		}
1262
1263		/* Yield point for this unbounded loop. */
1264		cond_resched();
1265	}
1266done:
1267	return more_to_do;
1268}
1269
1270static int
1271do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
1272{
1273	union blkif_back_rings *blk_rings = &ring->blk_rings;
1274	int more_to_do;
1275
1276	do {
1277		more_to_do = __do_block_io_op(ring, eoi_flags);
1278		if (more_to_do)
1279			break;
1280
1281		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
1282	} while (more_to_do);
1283
1284	return more_to_do;
1285}

/*
 * Transmute the 'struct blkif_request' into a proper 'struct bio' and call
 * 'submit_bio' to pass it to the underlying storage.
 */
1290static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
1291				struct blkif_request *req,
1292				struct pending_req *pending_req)
1293{
1294	struct phys_req preq;
1295	struct seg_buf *seg = pending_req->seg;
1296	unsigned int nseg;
1297	struct bio *bio = NULL;
1298	struct bio **biolist = pending_req->biolist;
1299	int i, nbio = 0;
1300	enum req_op operation;
1301	blk_opf_t operation_flags = 0;
1302	struct blk_plug plug;
1303	bool drain = false;
1304	struct grant_page **pages = pending_req->segments;
1305	unsigned short req_operation;
1306
1307	req_operation = req->operation == BLKIF_OP_INDIRECT ?
1308			req->u.indirect.indirect_op : req->operation;
1309
1310	if ((req->operation == BLKIF_OP_INDIRECT) &&
1311	    (req_operation != BLKIF_OP_READ) &&
1312	    (req_operation != BLKIF_OP_WRITE)) {
1313		pr_debug("Invalid indirect operation (%u)\n", req_operation);
1314		goto fail_response;
1315	}
1316
1317	switch (req_operation) {
1318	case BLKIF_OP_READ:
1319		ring->st_rd_req++;
1320		operation = REQ_OP_READ;
1321		break;
1322	case BLKIF_OP_WRITE:
1323		ring->st_wr_req++;
1324		operation = REQ_OP_WRITE;
1325		operation_flags = REQ_SYNC | REQ_IDLE;
1326		break;
1327	case BLKIF_OP_WRITE_BARRIER:
1328		drain = true;
1329		fallthrough;
1330	case BLKIF_OP_FLUSH_DISKCACHE:
1331		ring->st_f_req++;
1332		operation = REQ_OP_WRITE;
1333		operation_flags = REQ_PREFLUSH;
1334		break;
1335	default:
1336		operation = 0; /* make gcc happy */
1337		goto fail_response;
1338		break;
1339	}
1340
1341	/* Check that the number of segments is sane. */
1342	nseg = req->operation == BLKIF_OP_INDIRECT ?
1343	       req->u.indirect.nr_segments : req->u.rw.nr_segments;
1344
1345	if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) ||
1346	    unlikely((req->operation != BLKIF_OP_INDIRECT) &&
1347		     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
1348	    unlikely((req->operation == BLKIF_OP_INDIRECT) &&
1349		     (nseg > MAX_INDIRECT_SEGMENTS))) {
1350		pr_debug("Bad number of segments in request (%d)\n", nseg);
1351		/* Haven't submitted any bio's yet. */
1352		goto fail_response;
1353	}
1354
1355	preq.nr_sects      = 0;
1356
1357	pending_req->ring      = ring;
1358	pending_req->id        = req->u.rw.id;
1359	pending_req->operation = req_operation;
1360	pending_req->status    = BLKIF_RSP_OKAY;
1361	pending_req->nr_segs   = nseg;
1362
1363	if (req->operation != BLKIF_OP_INDIRECT) {
1364		preq.dev               = req->u.rw.handle;
1365		preq.sector_number     = req->u.rw.sector_number;
1366		for (i = 0; i < nseg; i++) {
1367			pages[i]->gref = req->u.rw.seg[i].gref;
1368			seg[i].nsec = req->u.rw.seg[i].last_sect -
1369				req->u.rw.seg[i].first_sect + 1;
1370			seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
1371			if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
1372			    (req->u.rw.seg[i].last_sect <
1373			     req->u.rw.seg[i].first_sect))
1374				goto fail_response;
1375			preq.nr_sects += seg[i].nsec;
1376		}
1377	} else {
1378		preq.dev               = req->u.indirect.handle;
1379		preq.sector_number     = req->u.indirect.sector_number;
1380		if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
1381			goto fail_response;
1382	}
1383
1384	if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
1385		pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
1386			 operation == REQ_OP_READ ? "read" : "write",
1387			 preq.sector_number,
1388			 preq.sector_number + preq.nr_sects,
1389			 ring->blkif->vbd.pdevice);
1390		goto fail_response;
1391	}
1392
1393	/*
1394	 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
1395	 * is set there.
1396	 */
1397	for (i = 0; i < nseg; i++) {
1398		if (((int)preq.sector_number|(int)seg[i].nsec) &
1399		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
1400			pr_debug("Misaligned I/O request from domain %d\n",
1401				 ring->blkif->domid);
1402			goto fail_response;
1403		}
1404	}
1405
	/*
	 * Wait on all outstanding I/Os and, once they have completed,
	 * issue the flush.
	 */
1409	if (drain)
1410		xen_blk_drain_io(pending_req->ring);
1411
1412	/*
1413	 * If we have failed at this point, we need to undo the M2P override,
1414	 * set gnttab_set_unmap_op on all of the grant references and perform
1415	 * the hypercall to unmap the grants - that is all done in
1416	 * xen_blkbk_unmap.
1417	 */
1418	if (xen_blkbk_map_seg(pending_req))
1419		goto fail_flush;
1420
	/*
	 * The corresponding xen_blkif_put is done in __end_block_io_op, or
	 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
	 */
1425	xen_blkif_get(ring->blkif);
1426	atomic_inc(&ring->inflight);
1427
1428	for (i = 0; i < nseg; i++) {
1429		while ((bio == NULL) ||
1430		       (bio_add_page(bio,
1431				     pages[i]->page,
1432				     seg[i].nsec << 9,
1433				     seg[i].offset) == 0)) {
1434			bio = bio_alloc(preq.bdev, bio_max_segs(nseg - i),
1435					operation | operation_flags,
1436					GFP_KERNEL);
1437			biolist[nbio++] = bio;
1438			bio->bi_private = pending_req;
1439			bio->bi_end_io  = end_block_io_op;
1440			bio->bi_iter.bi_sector  = preq.sector_number;
1441		}
1442
1443		preq.sector_number += seg[i].nsec;
1444	}
1445
1446	/* This will be hit if the operation was a flush or discard. */
1447	if (!bio) {
1448		BUG_ON(operation_flags != REQ_PREFLUSH);
1449
1450		bio = bio_alloc(preq.bdev, 0, operation | operation_flags,
1451				GFP_KERNEL);
1452		biolist[nbio++] = bio;
1453		bio->bi_private = pending_req;
1454		bio->bi_end_io  = end_block_io_op;
1455	}
1456
1457	atomic_set(&pending_req->pendcnt, nbio);
1458	blk_start_plug(&plug);
1459
1460	for (i = 0; i < nbio; i++)
1461		submit_bio(biolist[i]);
1462
1463	/* Let the I/Os go.. */
1464	blk_finish_plug(&plug);
1465
1466	if (operation == REQ_OP_READ)
1467		ring->st_rd_sect += preq.nr_sects;
1468	else if (operation == REQ_OP_WRITE)
1469		ring->st_wr_sect += preq.nr_sects;
1470
1471	return 0;
1472
1473 fail_flush:
1474	xen_blkbk_unmap(ring, pending_req->segments,
1475	                pending_req->nr_segs);
1476 fail_response:
1477	/* Haven't submitted any bio's yet. */
1478	make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
1479	free_req(ring, pending_req);
1480	msleep(1); /* back off a bit */
1481	return -EIO;
1482}
1483
1484
1485
1486/*
1487 * Put a response on the ring on how the operation fared.
1488 */
1489static void make_response(struct xen_blkif_ring *ring, u64 id,
1490			  unsigned short op, int st)
1491{
1492	struct blkif_response *resp;
1493	unsigned long     flags;
1494	union blkif_back_rings *blk_rings;
1495	int notify;
1496
1497	spin_lock_irqsave(&ring->blk_ring_lock, flags);
1498	blk_rings = &ring->blk_rings;
1499	/* Place on the response ring for the relevant domain. */
1500	switch (ring->blkif->blk_protocol) {
1501	case BLKIF_PROTOCOL_NATIVE:
1502		resp = RING_GET_RESPONSE(&blk_rings->native,
1503					 blk_rings->native.rsp_prod_pvt);
1504		break;
1505	case BLKIF_PROTOCOL_X86_32:
1506		resp = RING_GET_RESPONSE(&blk_rings->x86_32,
1507					 blk_rings->x86_32.rsp_prod_pvt);
1508		break;
1509	case BLKIF_PROTOCOL_X86_64:
1510		resp = RING_GET_RESPONSE(&blk_rings->x86_64,
1511					 blk_rings->x86_64.rsp_prod_pvt);
1512		break;
1513	default:
1514		BUG();
1515	}
1516
1517	resp->id        = id;
1518	resp->operation = op;
1519	resp->status    = st;
1520
1521	blk_rings->common.rsp_prod_pvt++;
1522	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1523	spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
1524	if (notify)
1525		notify_remote_via_irq(ring->irq);
1526}
1527
1528static int __init xen_blkif_init(void)
1529{
1530	int rc = 0;
1531
1532	if (!xen_domain())
1533		return -ENODEV;
1534
1535	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
1536		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
1537			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
1538		xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
1539	}
1540
1541	if (xenblk_max_queues == 0)
1542		xenblk_max_queues = num_online_cpus();
1543
1544	rc = xen_blkif_interface_init();
1545	if (rc)
1546		goto failed_init;
1547
1548	rc = xen_blkif_xenbus_init();
1549	if (rc)
1550		goto failed_init;
1551
1552 failed_init:
1553	return rc;
1554}
1555
1556module_init(xen_blkif_init);
1557
1558static void __exit xen_blkif_fini(void)
1559{
1560	xen_blkif_xenbus_fini();
1561	xen_blkif_interface_fini();
1562}
1563
1564module_exit(xen_blkif_fini);
1565
1566MODULE_LICENSE("Dual BSD/GPL");
1567MODULE_ALIAS("xen-backend:vbd");
1568