/* Minimal block driver for Mini-OS.
 * Copyright (c) 2007-2008 Samuel Thibault.
 * Based on netfront.c.
 */

#include <mini-os/os.h>
#include <mini-os/xenbus.h>
#include <mini-os/events.h>
#include <mini-os/gnttab.h>
#include <mini-os/blkfront.h>

#include <xen/io/blkif.h>
#include <xen/io/protocols.h>

#include <bmk-core/errno.h>
#include <bmk-core/memalloc.h>
#include <bmk-core/pgalloc.h>
#include <bmk-core/printf.h>
#include <bmk-core/string.h>

/* SHARED_RING_INIT() uses memset() */
#define memset(a,b,c) bmk_memset(a,b,c)

/* Note: we generally don't need to disable IRQs since we hardly do anything in
 * the interrupt handler.  */

/* Note: we really assume non-preemptive threads.  */

DECLARE_WAIT_QUEUE_HEAD(blkfront_queue);
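
/*
 * The shared ring occupies a single page; __RING_SIZE rounds the number
 * of request slots that fit in that page down to a power of two.
 * GRANT_INVALID_REF marks a grant reference that is not in use.
 */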
#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
#define GRANT_INVALID_REF 0

struct blk_buffer {
    void* page;
    grant_ref_t gref;
};

struct blkfront_dev {
    domid_t dom;

    struct blkif_front_ring ring;
    grant_ref_t ring_ref;
    evtchn_port_t evtchn;
    blkif_vdev_t handle;

    char nodename[64];
    char *backend;
    struct blkfront_info info;

    struct xenbus_event_queue events;
};
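
/*
 * The event channel handler only wakes up threads sleeping on
 * blkfront_queue; responses are consumed later by blkfront_aio_poll().
 */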
void blkfront_handler(evtchn_port_t port, struct pt_regs *regs, void *data)
{
    minios_wake_up(&blkfront_queue);
}

static void free_blkfront(struct blkfront_dev *dev)
{
    minios_mask_evtchn(dev->evtchn);

    bmk_memfree(dev->backend, BMK_MEMWHO_WIREDBMK);

    gnttab_end_access(dev->ring_ref);
    bmk_pgfree_one(dev->ring.sring);

    minios_unbind_evtchn(dev->evtchn);

    bmk_memfree(dev, BMK_MEMWHO_WIREDBMK);
}

struct blkfront_dev *blkfront_init(char *_nodename, struct blkfront_info *info)
{
    xenbus_transaction_t xbt;
    char* err;
    char* message = NULL;
    struct blkif_sring *s;
    int retry = 0;
    char* msg = NULL;
    char* c;
    char* nodename = _nodename ? _nodename : "device/vbd/768";
    unsigned long len;

    struct blkfront_dev *dev;

    char path[bmk_strlen(nodename) + 1 + 10 + 1];

    dev = bmk_memcalloc(1, sizeof(*dev), BMK_MEMWHO_WIREDBMK);
    bmk_strncpy(dev->nodename, nodename, sizeof(dev->nodename)-1);

    bmk_snprintf(path, sizeof(path), "%s/backend-id", nodename);
    dev->dom = xenbus_read_integer(path);
    minios_evtchn_alloc_unbound(dev->dom, blkfront_handler, dev, &dev->evtchn);

    s = bmk_pgalloc_one();
    bmk_memset(s, 0, PAGE_SIZE);

    SHARED_RING_INIT(s);
    FRONT_RING_INIT(&dev->ring, s, PAGE_SIZE);

    dev->ring_ref = gnttab_grant_access(dev->dom, virt_to_mfn(s), 0);

    xenbus_event_queue_init(&dev->events);
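
    /*
     * Advertise ring-ref, event-channel and protocol under our xenbus
     * node in one transaction; xenbus asks us to retry when the
     * transaction raced with a concurrent update.
     */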
again:
    err = xenbus_transaction_start(&xbt);
    if (err) {
        minios_printk("error starting transaction: %s\n", err);
        bmk_memfree(err, BMK_MEMWHO_WIREDBMK);
    }

    err = xenbus_printf(xbt, nodename, "ring-ref", "%u",
                dev->ring_ref);
    if (err) {
        message = "writing ring-ref";
        goto abort_transaction;
    }
    err = xenbus_printf(xbt, nodename,
                "event-channel", "%u", dev->evtchn);
    if (err) {
        message = "writing event-channel";
        goto abort_transaction;
    }
    err = xenbus_printf(xbt, nodename,
                "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE);
    if (err) {
        message = "writing protocol";
        goto abort_transaction;
    }

    bmk_snprintf(path, sizeof(path), "%s/state", nodename);
    err = xenbus_switch_state(xbt, path, XenbusStateConnected);
    if (err) {
        message = "switching state";
        goto abort_transaction;
    }

    err = xenbus_transaction_end(xbt, 0, &retry);
    if (err) {
        bmk_memfree(err, BMK_MEMWHO_WIREDBMK);
        err = NULL;
    }
    if (retry) {
        minios_printk("completing transaction\n");
        goto again;
    }

    goto done;

abort_transaction:
    bmk_memfree(err, BMK_MEMWHO_WIREDBMK);
    err = xenbus_transaction_end(xbt, 1, &retry);
    minios_printk("blkfront: aborting transaction: %s\n", message);
    goto error;

done:
    bmk_snprintf(path, sizeof(path), "%s/backend", nodename);
    msg = xenbus_read(XBT_NIL, path, &dev->backend);
    if (msg) {
        minios_printk("Error %s when reading the backend path %s\n", msg, path);
        goto error;
    }

    minios_printk("blkfront: node=%s backend=%s\n", nodename, dev->backend);
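
    /* The device handle is the last component of the node name,
     * e.g. 768 for "device/vbd/768". */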
    len = bmk_strlen(nodename);
    dev->handle = bmk_strtoul((char *)bmk_memrchr(nodename, '/', len)+1, NULL, 10);

    {
        XenbusState state;
        char path[bmk_strlen(dev->backend) + 1 + 19 + 1];
        bmk_snprintf(path, sizeof(path), "%s/mode", dev->backend);
        msg = xenbus_read(XBT_NIL, path, &c);
        if (msg) {
            minios_printk("Error %s when reading the mode\n", msg);
            goto error;
        }
        if (*c == 'w')
            dev->info.mode = BLKFRONT_RDWR;
        else
            dev->info.mode = BLKFRONT_RDONLY;
        bmk_memfree(c, BMK_MEMWHO_WIREDBMK);

        bmk_snprintf(path, sizeof(path), "%s/state", dev->backend);

        xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);

        msg = NULL;
        state = xenbus_read_integer(path);
        while (msg == NULL && state < XenbusStateConnected)
            msg = xenbus_wait_for_state_change(path, &state, &dev->events);
        if (msg != NULL || state != XenbusStateConnected) {
            minios_printk("backend not available, state=%d\n", state);
            xenbus_unwatch_path_token(XBT_NIL, path, path);
            goto error;
        }

        bmk_snprintf(path, sizeof(path), "%s/info", dev->backend);
        dev->info.info = xenbus_read_integer(path);

        bmk_snprintf(path, sizeof(path), "%s/sectors", dev->backend);
        /* FIXME: read_integer returns an int, so disk size is limited
         * to 1TB for now */
        dev->info.sectors = xenbus_read_integer(path);

        bmk_snprintf(path, sizeof(path), "%s/sector-size", dev->backend);
        dev->info.sector_size = xenbus_read_integer(path);

        bmk_snprintf(path, sizeof(path), "%s/feature-barrier", dev->backend);
        dev->info.barrier = xenbus_read_integer(path);

        bmk_snprintf(path, sizeof(path), "%s/feature-flush-cache", dev->backend);
        dev->info.flush = xenbus_read_integer(path);

        *info = dev->info;
    }
    minios_unmask_evtchn(dev->evtchn);

    minios_printk("blkfront: %u sectors\n", dev->info.sectors);

    return dev;

error:
    bmk_memfree(msg, BMK_MEMWHO_WIREDBMK);
    bmk_memfree(err, BMK_MEMWHO_WIREDBMK);
    free_blkfront(dev);
    return NULL;
}
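
/*
 * Example (sketch, not part of the driver): bringing up the default vbd
 * and reporting its geometry.  Error handling is elided; my_attach() is
 * a hypothetical caller.
 *
 *    static void my_attach(void)
 *    {
 *        struct blkfront_info info;
 *        struct blkfront_dev *dev;
 *
 *        dev = blkfront_init(NULL, &info);   // NULL => "device/vbd/768"
 *        if (dev == NULL)
 *            return;
 *        minios_printk("disk: %u sectors of %u bytes\n",
 *                      info.sectors, info.sector_size);
 *    }
 */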

void blkfront_shutdown(struct blkfront_dev *dev)
{
    char* err = NULL;
    XenbusState state;

    char path[bmk_strlen(dev->backend) + 1 + 5 + 1];
    char nodename[bmk_strlen(dev->nodename) + 1 + 5 + 1];

    blkfront_sync(dev);

    minios_printk("blkfront: detaching node=%s\n", dev->nodename);

    bmk_snprintf(path, sizeof(path), "%s/state", dev->backend);
    bmk_snprintf(nodename, sizeof(nodename), "%s/state", dev->nodename);

    if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosing)) != NULL) {
        minios_printk("shutdown_blkfront: error changing state to %d: %s\n",
                XenbusStateClosing, err);
        goto close;
    }
    state = xenbus_read_integer(path);
    while (err == NULL && state < XenbusStateClosing)
        err = xenbus_wait_for_state_change(path, &state, &dev->events);
    if (err) bmk_memfree(err, BMK_MEMWHO_WIREDBMK);

    if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateClosed)) != NULL) {
        minios_printk("shutdown_blkfront: error changing state to %d: %s\n",
                XenbusStateClosed, err);
        goto close;
    }
    state = xenbus_read_integer(path);
    while (err == NULL && state < XenbusStateClosed)
        err = xenbus_wait_for_state_change(path, &state, &dev->events);
    if (err) bmk_memfree(err, BMK_MEMWHO_WIREDBMK);

    if ((err = xenbus_switch_state(XBT_NIL, nodename, XenbusStateInitialising)) != NULL) {
        minios_printk("shutdown_blkfront: error changing state to %d: %s\n",
                XenbusStateInitialising, err);
        goto close;
    }
    err = NULL;
    state = xenbus_read_integer(path);
    while (err == NULL && (state < XenbusStateInitWait || state >= XenbusStateClosed))
        err = xenbus_wait_for_state_change(path, &state, &dev->events);

close:
    if (err) bmk_memfree(err, BMK_MEMWHO_WIREDBMK);
    xenbus_unwatch_path_token(XBT_NIL, path, path);

    bmk_snprintf(path, sizeof(path), "%s/ring-ref", nodename);
    xenbus_rm(XBT_NIL, path);
    bmk_snprintf(path, sizeof(path), "%s/event-channel", nodename);
    xenbus_rm(XBT_NIL, path);

    if (!err)
        free_blkfront(dev);
}
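
/*
 * Block until the shared ring has a free request slot.  The waiter is
 * registered before interrupts are re-enabled, so a wakeup posted
 * between the poll and the sleep is not lost.
 */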
static void blkfront_wait_slot(struct blkfront_dev *dev)
{
    /* Wait for a slot */
    if (RING_FULL(&dev->ring)) {
        unsigned long flags;
        DEFINE_WAIT(w);
        local_irq_save(flags);
        while (1) {
            blkfront_aio_poll(dev);
            if (!RING_FULL(&dev->ring))
                break;
            /* Really no slot, go to sleep. */
            minios_add_waiter(w, blkfront_queue);
            local_irq_restore(flags);
            minios_wait(w);
            local_irq_save(flags);
        }
        minios_remove_waiter(w, blkfront_queue);
        local_irq_restore(flags);
    }
}

/* Issue an aio */
void blkfront_aio(struct blkfront_aiocb *aiocbp, int write)
{
    struct blkfront_dev *dev = aiocbp->aio_dev;
    struct blkif_request *req;
    RING_IDX i;
    int notify;
    int n, j;
    uintptr_t start, end;

    /* The offset, the length and the buffer must all be sector-aligned */
    ASSERT(!(aiocbp->aio_offset & (dev->info.sector_size-1)));
    ASSERT(!(aiocbp->aio_nbytes & (dev->info.sector_size-1)));
    ASSERT(!((uintptr_t) aiocbp->aio_buf & (dev->info.sector_size-1)));
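
    /*
     * Determine how many pages the (possibly unaligned) buffer spans;
     * each page becomes one segment of the request.
     */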
    start = (uintptr_t)aiocbp->aio_buf & PAGE_MASK;
    end = ((uintptr_t)aiocbp->aio_buf + aiocbp->aio_nbytes + PAGE_SIZE - 1) & PAGE_MASK;
    aiocbp->n = n = (end - start) / PAGE_SIZE;

    /* qemu's IDE max multsect is 16 (8KB) and SCSI max DMA was set to 32KB,
     * so max 44KB can't happen */
    ASSERT(n <= BLKIF_MAX_SEGMENTS_PER_REQUEST);

    blkfront_wait_slot(dev);
    i = dev->ring.req_prod_pvt;
    req = RING_GET_REQUEST(&dev->ring, i);

    req->operation = write ? BLKIF_OP_WRITE : BLKIF_OP_READ;
    req->nr_segments = n;
    req->handle = dev->handle;
    req->id = (uintptr_t) aiocbp;
    req->sector_number = aiocbp->aio_offset / 512;
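
    /*
     * Each segment covers a whole page by default; the first and last
     * segments are then trimmed (in 512-byte sectors) so that only the
     * requested byte range is transferred.
     */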
    for (j = 0; j < n; j++) {
        req->seg[j].first_sect = 0;
        req->seg[j].last_sect = PAGE_SIZE / 512 - 1;
    }
    req->seg[0].first_sect = ((uintptr_t)aiocbp->aio_buf & ~PAGE_MASK) / 512;
    req->seg[n-1].last_sect = (((uintptr_t)aiocbp->aio_buf + aiocbp->aio_nbytes - 1) & ~PAGE_MASK) / 512;
    for (j = 0; j < n; j++) {
        uintptr_t data = start + j * PAGE_SIZE;
        if (!write) {
            /* Trigger CoW if needed */
            *(char*)(data + (req->seg[j].first_sect << 9)) = 0;
            barrier();
        }
        aiocbp->gref[j] = req->seg[j].gref =
            gnttab_grant_access(dev->dom, virt_to_mfn(data), write);
    }

    dev->ring.req_prod_pvt = i + 1;

    wmb();
    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->ring, notify);

    if (notify) minios_notify_remote_via_evtchn(dev->evtchn);
}
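
/*
 * Example (sketch): issuing an asynchronous read.  my_done() is a
 * hypothetical completion callback; it runs from blkfront_aio_poll()
 * and may free its aiocb.  The buffer and offset must satisfy the
 * alignment asserts above; dev and info come from blkfront_init().
 *
 *    static void my_done(struct blkfront_aiocb *aiocb, int ret)
 *    {
 *        minios_printk("read done: %d\n", ret);
 *    }
 *
 *    static struct blkfront_aiocb acb;
 *
 *    acb.aio_dev = dev;
 *    acb.aio_buf = buf;                  // sector-aligned buffer
 *    acb.aio_nbytes = info.sector_size;  // one sector
 *    acb.aio_offset = 0;
 *    acb.aio_cb = my_done;
 *    blkfront_aio(&acb, 0);              // 0 = read, 1 = write
 */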

static void blkfront_aio_cb(struct blkfront_aiocb *aiocbp, int ret)
{
    aiocbp->data = (void*) 1;
}
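
/*
 * Synchronous wrapper: issue the aio, then poll and sleep until the
 * completion callback above marks it done.
 */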
void blkfront_io(struct blkfront_aiocb *aiocbp, int write)
{
    unsigned long flags;
    DEFINE_WAIT(w);

    ASSERT(!aiocbp->aio_cb);
    aiocbp->aio_cb = blkfront_aio_cb;
    aiocbp->data = NULL;
    blkfront_aio(aiocbp, write);

    local_irq_save(flags);
    while (1) {
        blkfront_aio_poll(aiocbp->aio_dev);
        if (aiocbp->data)
            break;

        minios_add_waiter(w, blkfront_queue);
        local_irq_restore(flags);
        minios_wait(w);
        local_irq_save(flags);
    }
    minios_remove_waiter(w, blkfront_queue);
    local_irq_restore(flags);
}
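
/* Queue a zero-segment request; used for barrier and flush operations. */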
static void blkfront_push_operation(struct blkfront_dev *dev, uint8_t op, uint64_t id)
{
    int i;
    struct blkif_request *req;
    int notify;

    blkfront_wait_slot(dev);
    i = dev->ring.req_prod_pvt;
    req = RING_GET_REQUEST(&dev->ring, i);
    req->operation = op;
    req->nr_segments = 0;
    req->handle = dev->handle;
    req->id = id;
    /* Not needed anyway, but the backend will check it */
    req->sector_number = 0;
    dev->ring.req_prod_pvt = i + 1;
    wmb();
    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&dev->ring, notify);
    if (notify) minios_notify_remote_via_evtchn(dev->evtchn);
}

void blkfront_aio_push_operation(struct blkfront_aiocb *aiocbp, uint8_t op)
{
    struct blkfront_dev *dev = aiocbp->aio_dev;
    blkfront_push_operation(dev, op, (uintptr_t) aiocbp);
}

void blkfront_sync(struct blkfront_dev *dev)
{
    unsigned long flags;
    DEFINE_WAIT(w);

    if (dev->info.mode == BLKFRONT_RDWR) {
        if (dev->info.barrier == 1)
            blkfront_push_operation(dev, BLKIF_OP_WRITE_BARRIER, 0);

        if (dev->info.flush == 1)
            blkfront_push_operation(dev, BLKIF_OP_FLUSH_DISKCACHE, 0);
    }

    /* Note: This won't finish if another thread enqueues requests.  */
    local_irq_save(flags);
    while (1) {
        blkfront_aio_poll(dev);
        if (RING_FREE_REQUESTS(&dev->ring) == RING_SIZE(&dev->ring))
            break;

        minios_add_waiter(w, blkfront_queue);
        local_irq_restore(flags);
        minios_wait(w);
        local_irq_save(flags);
    }
    minios_remove_waiter(w, blkfront_queue);
    local_irq_restore(flags);
}
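
/*
 * Consume pending responses: end grant access on the data pages and
 * invoke completion callbacks.  Returns the number of responses
 * consumed.  A callback may free its aiocb and may re-enter this
 * function, hence the rsp_cons re-check below.
 */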
int blkfront_aio_poll(struct blkfront_dev *dev)
{
    RING_IDX rp, cons;
    struct blkif_response *rsp;
    int more;
    int nr_consumed;

moretodo:
    rp = dev->ring.sring->rsp_prod;
    rmb(); /* Ensure we see queued responses up to 'rp'. */
    cons = dev->ring.rsp_cons;

    nr_consumed = 0;
    while (cons != rp)
    {
        struct blkfront_aiocb *aiocbp;
        int status;

        rsp = RING_GET_RESPONSE(&dev->ring, cons);
        nr_consumed++;

        aiocbp = (void*) (uintptr_t) rsp->id;
        status = rsp->status;

        if (status != BLKIF_RSP_OKAY)
            minios_printk("block error %d for op %d\n", status, rsp->operation);

        switch (rsp->operation) {
        case BLKIF_OP_READ:
        case BLKIF_OP_WRITE:
        {
            int j;

            for (j = 0; j < aiocbp->n; j++)
                gnttab_end_access(aiocbp->gref[j]);

            break;
        }

        case BLKIF_OP_WRITE_BARRIER:
        case BLKIF_OP_FLUSH_DISKCACHE:
            break;

        default:
            minios_printk("unrecognized block operation %d response\n", rsp->operation);
        }

        dev->ring.rsp_cons = ++cons;
        /* Note: the callback may free aiocbp itself */
        if (aiocbp && aiocbp->aio_cb)
            aiocbp->aio_cb(aiocbp, status ? -BMK_EIO : 0);
        if (dev->ring.rsp_cons != cons)
            /* We reentered, we must not continue here */
            break;
    }

    RING_FINAL_CHECK_FOR_RESPONSES(&dev->ring, more);
    if (more) goto moretodo;

    return nr_consumed;
}