/*
   This file contains the global device cache for the BeOS.  All
   file system I/O comes through here.  The cache can handle blocks
   of different sizes for multiple different underlying physical
   devices.

   The cache is organized as a hash table (for lookups by device
   and block number) and two doubly-linked lists.  The normal
   list is for "normal" blocks which are either clean or dirty.
   The locked list is for blocks that are locked in the cache by
   BFS.  The lists are LRU ordered.

   Most of the work happens in the function cache_block_io() which
   is quite lengthy.  The other functions of interest are get_ents()
   which picks victims to be kicked out of the cache; flush_ents()
   which does the work of flushing them; and set_blocks_info() which
   handles cloning blocks and setting callbacks so that the BFS
   journal will work properly.  If you want to modify this code it
   will take some study but it's not too bad.  Do not think about
   separating the list of clean and dirty blocks into two lists as
   I did that already and it's slower.

   Originally this cache code was written while listening to the album
   "Ride the Lightning" by Metallica.  The current version was written
   while listening to "Welcome to SkyValley" by Kyuss as well as an
   ambient collection on the German label FAX.  Music helps a lot when
   writing code.

   THIS CODE COPYRIGHT DOMINIC GIAMPAOLO.  NO WARRANTY IS EXPRESSED
   OR IMPLIED.  YOU MAY USE THIS CODE AND FREELY DISTRIBUTE IT FOR
   NON-COMMERCIAL USE AS LONG AS THIS NOTICE REMAINS ATTACHED.

   FOR COMMERCIAL USE, CONTACT DOMINIC GIAMPAOLO (dbg@be.com).

   Dominic Giampaolo
   dbg@be.com
*/

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>


#if (defined(__BEOS__) || defined(__HAIKU__))
#include <OS.h>
#include <KernelExport.h>
#endif

#include "compat.h"
#include "lock.h"
#include "cache.h"



#ifndef USER
#define printf dprintf
#endif
#ifdef USER
#define kprintf printf
#endif


/* forward prototypes */
static int   flush_ents(cache_ent **ents, int n_ents);

//static int   do_dump(int argc, char **argv);
//static int   do_find_block(int argc, char **argv);
//static int   do_find_data(int argc, char **argv);
//static void  cache_flusher(void *arg, int phase);


int chatty_io = 0;

#define CHUNK (512 * 1024)   /* a hack to work around scsi driver bugs */

size_t
read_phys_blocks(int fd, fs_off_t bnum, void *data, uint num_blocks, int bsize)
{
    size_t ret = 0;
    size_t sum;

    if (chatty_io)
        printf("R: %8Ld : %3d\n", bnum, num_blocks);

    if (num_blocks * bsize < CHUNK)
        ret = read_pos(fd, bnum * bsize, data, num_blocks * bsize);
    else {
        for(sum=0; (sum + CHUNK) <= (num_blocks * bsize); sum += CHUNK) {
            ret = read_pos(fd, (bnum * bsize) + sum, data, CHUNK);
            if (ret != CHUNK)
                break;

            data = (void *)((char *)data + CHUNK);
        }

        if (ret == CHUNK && ((num_blocks * bsize) - sum) > 0) {
            ret = read_pos(fd, (bnum * bsize) + sum, data,
                           (num_blocks * bsize) - sum);

            if (ret == (num_blocks * bsize) - sum)
                ret = num_blocks * bsize;
        } else if (ret == CHUNK) {
            ret = num_blocks * bsize;
        }
    }

    if (ret == num_blocks * bsize)
        return 0;
    else
        return EBADF;
}

size_t
write_phys_blocks(int fd, fs_off_t bnum, void *data, uint num_blocks, int bsize)
{
    size_t ret = 0;
    size_t sum;

    if (chatty_io)
        printf("W: %8Ld : %3d\n", bnum, num_blocks);

    if (num_blocks * bsize < CHUNK)
        ret = write_pos(fd, bnum * bsize, data, num_blocks * bsize);
    else {
        for(sum=0; (sum + CHUNK) <= (num_blocks * bsize); sum += CHUNK) {
            ret = write_pos(fd, (bnum * bsize) + sum, data, CHUNK);
            if (ret != CHUNK)
                break;

            data = (void *)((char *)data + CHUNK);
        }

        if (ret == CHUNK && ((num_blocks * bsize) - sum) > 0) {
            ret = write_pos(fd, (bnum * bsize) + sum, data,
                            (num_blocks * bsize) - sum);

            if (ret == (num_blocks * bsize) - sum)
                ret = num_blocks * bsize;
        } else if (ret == CHUNK) {
            ret = num_blocks * bsize;
        }
    }


    if (ret == num_blocks * bsize)
        return 0;
    else
        return EBADF;
}
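
/*
   A disabled usage sketch (illustration only, not part of the cache):
   it assumes a valid file descriptor "fd" opened elsewhere and a block
   size of 1024 bytes.  Callers never deal with CHUNK themselves; any
   transfer of CHUNK bytes or more is split up internally above.
*/
#if 0
static void
example_read_superblock(int fd)
{
    char buff[1024];

    /* read one 1024-byte block at block number 1 */
    if (read_phys_blocks(fd, 1, buff, 1, 1024) != 0)
        printf("could not read block 1!\n");
}
#endif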

//	#pragma mark -

static int
init_hash_table(hash_table *ht)
{
    ht->max   = HT_DEFAULT_MAX;
    ht->mask  = ht->max - 1;
    ht->num_elements = 0;

    ht->table = (hash_ent **)calloc(ht->max, sizeof(hash_ent *));
    if (ht->table == NULL)
        return ENOMEM;

    return 0;
}


static void
shutdown_hash_table(hash_table *ht)
{
    int       i, hash_len;
    hash_ent *he, *next;

    for(i=0; i < ht->max; i++) {
        he = ht->table[i];

        for(hash_len=0; he; hash_len++, he=next) {
            next = he->next;
            free(he);
        }
    }

    if (ht->table)
        free(ht->table);
    ht->table = NULL;
}

#if 0
static void
print_hash_stats(hash_table *ht)
{
    int       i, hash_len, max = -1, sum = 0;
    hash_ent *he, *next;

    for(i=0; i < ht->max; i++) {
        he = ht->table[i];

        for(hash_len=0; he; hash_len++, he=next) {
            next = he->next;
        }
        if (hash_len)
            printf("bucket %3d : %3d\n", i, hash_len);

        sum += hash_len;
        if (hash_len > max)
            max = hash_len;
    }

    printf("max # of chains: %d,  average chain length %d\n", max,sum/ht->max);
}
#endif

#define HASH(d, b)   ((((fs_off_t)d) << (sizeof(fs_off_t)*8 - 6)) | (b))
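
/*
   For an 8-byte fs_off_t the shift above is 58, so HASH() packs the
   device id into the top 6 bits and OR's the block number into the
   rest.  A disabled sketch of how a bucket index is derived ("dev",
   "bnum" and "ht" are hypothetical locals):
*/
#if 0
    fs_off_t  hval   = HASH(dev, bnum);     /* 64-bit hash value          */
    int       bucket = hval & ht->mask;     /* mask is always ht->max - 1 */
    hash_ent *he     = ht->table[bucket];   /* walk this chain to search  */
#endif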

static hash_ent *
new_hash_ent(int dev, fs_off_t bnum, void *data)
{
    hash_ent *he;

    he = (hash_ent *)malloc(sizeof(*he));
    if (he == NULL)
        return NULL;

    he->hash_val = HASH(dev, bnum);
    he->dev      = dev;
    he->bnum     = bnum;
    he->data     = data;
    he->next     = NULL;

    return he;
}


static int
grow_hash_table(hash_table *ht)
{
    int        i, omax, newsize, newmask;
    fs_off_t      hash;
    hash_ent **new_table, *he, *next;

    if (ht->max & ht->mask) {
        printf("*** hashtable size %d or mask %d looks weird!\n", ht->max,
               ht->mask);
    }

    omax    = ht->max;
    newsize = omax * 2;        /* have to grow in powers of two */
    newmask = newsize - 1;

    new_table = (hash_ent **)calloc(newsize, sizeof(hash_ent *));
    if (new_table == NULL)
        return ENOMEM;

    for(i=0; i < omax; i++) {
        for(he=ht->table[i]; he; he=next) {
            hash = he->hash_val & newmask;
            next = he->next;

            he->next        = new_table[hash];
            new_table[hash] = he;
        }
    }

    free(ht->table);
    ht->table = new_table;
    ht->max   = newsize;
    ht->mask  = newmask;

    return 0;
}




static int
hash_insert(hash_table *ht, int dev, fs_off_t bnum, void *data)
{
    fs_off_t    hash;
    hash_ent *he, *curr;

    hash = HASH(dev, bnum) & ht->mask;

    curr = ht->table[hash];
    for(; curr != NULL; curr=curr->next)
        if (curr->dev == dev && curr->bnum == bnum)
            break;

    if (curr && curr->dev == dev && curr->bnum == bnum) {
        printf("entry %d:%Ld already in the hash table!\n", dev, bnum);
        return EEXIST;
    }

    he = new_hash_ent(dev, bnum, data);
    if (he == NULL)
        return ENOMEM;

    he->next        = ht->table[hash];
    ht->table[hash] = he;

    ht->num_elements++;
    if (ht->num_elements >= ((ht->max * 3) / 4)) {
        if (grow_hash_table(ht) != 0)
            return ENOMEM;
    }

    return 0;
}

static void *
hash_lookup(hash_table *ht, int dev, fs_off_t bnum)
{
    hash_ent *he;

    he = ht->table[HASH(dev, bnum) & ht->mask];

    for(; he != NULL; he=he->next) {
        if (he->dev == dev && he->bnum == bnum)
            break;
    }

    if (he)
        return he->data;
    else
        return NULL;
}


static void *
hash_delete(hash_table *ht, int dev, fs_off_t bnum)
{
    void     *data;
    fs_off_t     hash;
    hash_ent *he, *prev = NULL;

    hash = HASH(dev, bnum) & ht->mask;
    he = ht->table[hash];

    for(; he != NULL; prev=he,he=he->next) {
        if (he->dev == dev && he->bnum == bnum)
            break;
    }

    if (he == NULL) {
        printf("*** hash_delete: tried to delete non-existent block %d:%Ld\n",
               dev, bnum);
        return NULL;
    }

    data = he->data;

    if (ht->table[hash] == he)
        ht->table[hash] = he->next;
    else if (prev)
        prev->next = he->next;
    else
        panic("hash table is inconsistent\n");

    free(he);
    ht->num_elements--;

    return data;
}
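
/*
   The three functions above are the entire hash table interface the
   cache uses.  A disabled sketch of the expected call pattern (the
   function and its arguments are hypothetical):
*/
#if 0
static void
example_hash_usage(hash_table *ht, void *some_data)
{
    if (hash_insert(ht, 0, 42, some_data) != 0)    /* dev 0, block 42 */
        printf("insert failed (duplicate or out of memory)\n");

    if (hash_lookup(ht, 0, 42) != some_data)       /* finds the data ptr */
        printf("lookup didn't return what was inserted?\n");

    if (hash_delete(ht, 0, 42) != some_data)       /* returns the data ptr */
        printf("delete didn't return what was inserted?\n");
}
#endif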

//	#pragma mark -

/*
  These are the global variables for the cache.
*/
static block_cache  bc;

#define       MAX_IOVECS  64           /* # of iovecs for use by cache code */
static lock   iovec_lock;
static struct iovec *iovec_pool[MAX_IOVECS];  /* each ptr is to an array of iovecs */
static int    iovec_used[MAX_IOVECS];  /* non-zero == iovec is in use */

#define NUM_FLUSH_BLOCKS 64    /* size of the iovec array pointed to by each ptr */


#define DEFAULT_READ_AHEAD_SIZE  (32 * 1024)
static int read_ahead_size = DEFAULT_READ_AHEAD_SIZE;

/* this array stores the size of each device so we can error check requests */
#define MAX_DEVICES  256
fs_off_t max_device_blocks[MAX_DEVICES];


/* holds the time of the last cache access so cache flushing doesn't interfere */
static bigtime_t last_cache_access = 0;


int
init_block_cache(int max_blocks, int flags)
{
    memset(&bc, 0, sizeof(bc));
    memset(iovec_pool, 0, sizeof(iovec_pool));
    memset(iovec_used, 0, sizeof(iovec_used));
    memset(&max_device_blocks, 0, sizeof(max_device_blocks));

    if (init_hash_table(&bc.ht) != 0)
        return ENOMEM;

    bc.lock.s = iovec_lock.s = -1;

    bc.max_blocks = max_blocks;
    bc.flags      = flags;
    if (new_lock(&bc.lock, "bollockcache") != 0)
        goto err;

    if (new_lock(&iovec_lock, "iovec_lock") != 0)
        goto err;

    /* allocate two of these up front so vm won't accidentally re-enter itself */
    iovec_pool[0] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS);
    iovec_pool[1] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS);

#ifndef USER
#ifdef DEBUG
    add_debugger_command("bcache", do_dump, "dump the block cache list");
    add_debugger_command("fblock", do_find_block, "find a block in the cache");
    add_debugger_command("fdata",  do_find_data, "find a data block ptr in the cache");
#endif
    register_kernel_daemon(cache_flusher, NULL, 3);
#endif

    return 0;

 err:
    if (bc.lock.s >= 0)
        free_lock(&bc.lock);

    if (iovec_lock.s >= 0)
        free_lock(&iovec_lock);

    shutdown_hash_table(&bc.ht);
    memset((void *)&bc, 0, sizeof(bc));
    return ENOMEM;
}


static struct iovec *
get_iovec_array(void)
{
    int i;
    struct iovec *iov;

    LOCK(iovec_lock);

    for(i=0; i < MAX_IOVECS; i++) {
        if (iovec_used[i] == 0)
            break;
    }

    if (i >= MAX_IOVECS)       /* uh-oh */
        panic("cache: ran out of iovecs (pool 0x%x, used 0x%x)!\n",
              &iovec_pool[0], &iovec_used[0]);
    if (iovec_pool[i] == NULL) {
        iovec_pool[i] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS);
        if (iovec_pool[i] == NULL)    /* check the slot we just filled, not the array */
            panic("can't allocate an iovec!\n");
    }

    iov = iovec_pool[i];
    iovec_used[i] = 1;

    UNLOCK(iovec_lock);

    return iov;
}


static void
release_iovec_array(struct iovec *iov)
{
    int i;

    LOCK(iovec_lock);

    for(i=0; i < MAX_IOVECS; i++) {
        if (iov == iovec_pool[i])
            break;
    }

    if (i < MAX_IOVECS)
        iovec_used[i] = 0;
    else                     /* uh-oh */
        printf("cache: released an iovec I don't own (iov %p)\n", iov);


    UNLOCK(iovec_lock);
}
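
/*
   Disabled sketch of how the pool above is meant to be used by the
   flush code.  get_iovec_array() panics instead of returning NULL when
   the pool is exhausted, so the NULL check is belt-and-suspenders:
*/
#if 0
    struct iovec *iov = get_iovec_array();  /* array of NUM_FLUSH_BLOCKS iovecs */

    if (iov != NULL) {
        /* ... fill iov[0..n-1] and hand it to writev_pos() ... */
        release_iovec_array(iov);           /* mark the array free again */
    }
#endif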




static void
real_dump_cache_list(cache_ent_list *cel)
{
    cache_ent *ce;

    kprintf("starting from LRU end:\n");

    for (ce = cel->lru; ce; ce = ce->next) {
        kprintf("ce %p dev %2d bnum %6Ld lock %d flag %d arg %p "
               "clone %p\n", ce, ce->dev, ce->block_num, ce->lock,
               ce->flags, ce->arg, ce->clone);
    }
    kprintf("MRU end\n");
}

static void
dump_cache_list(void)
{
    kprintf("NORMAL BLOCKS\n");
    real_dump_cache_list(&bc.normal);

    kprintf("LOCKED BLOCKS\n");
    real_dump_cache_list(&bc.locked);

    kprintf("cur blocks %d, max blocks %d ht @ %p\n", bc.cur_blocks,
           bc.max_blocks, &bc.ht);
}

#if 0
static void
check_bcache(char *str)
{
    int count = 0;
    cache_ent *ce, *prev = NULL;

    LOCK(bc.lock);

    for(ce=bc.normal.lru; ce; prev=ce, ce=ce->next) {
        count++;
    }

    for(ce=bc.locked.lru; ce; prev=ce, ce=ce->next) {
        count++;
    }

    if (count != bc.cur_blocks) {
        if (count < bc.cur_blocks - 16)
            panic("%s: count == %d, cur_blocks %d, prev 0x%x\n",
                    str, count, bc.cur_blocks, prev);
        else
            printf("%s: count == %d, cur_blocks %d, prev 0x%x\n",
                    str, count, bc.cur_blocks, prev);
    }

    UNLOCK(bc.lock);
}


static void
dump_lists(void)
{
    cache_ent *nce;

    printf("LOCKED 0x%x  (tail 0x%x, head 0x%x)\n", &bc.locked,
           bc.locked.lru, bc.locked.mru);
    for(nce=bc.locked.lru; nce; nce=nce->next)
        printf("nce @ 0x%x dev %d bnum %ld flags %d lock %d clone 0x%x func 0x%x\n",
               nce, nce->dev, nce->block_num, nce->flags, nce->lock, nce->clone,
               nce->func);

    printf("NORMAL 0x%x  (tail 0x%x, head 0x%x)\n", &bc.normal,
           bc.normal.lru, bc.normal.mru);
    for(nce=bc.normal.lru; nce; nce=nce->next)
        printf("nce @ 0x%x dev %d bnum %ld flags %d lock %d clone 0x%x func 0x%x\n",
               nce, nce->dev, nce->block_num, nce->flags, nce->lock, nce->clone,
               nce->func);
}


static void
check_lists(void)
{
    cache_ent *ce, *prev, *oce;
    cache_ent_list *cel;

    cel = &bc.normal;
    for(ce=cel->lru,prev=NULL; ce; prev=ce, ce=ce->next) {
        for(oce=bc.locked.lru; oce; oce=oce->next) {
            if (oce == ce) {
                dump_lists();
                panic("1:ce @ 0x%x is in two lists(cel 0x%x &LOCKED)\n",ce,cel);
            }
        }
    }
    if (prev && prev != cel->mru) {
        dump_lists();
        panic("*** last element in list != cel mru (ce 0x%x, cel 0x%x)\n",
              prev, cel);
    }

    cel = &bc.locked;
    for(ce=cel->lru,prev=NULL; ce; prev=ce, ce=ce->next) {
        for(oce=bc.normal.lru; oce; oce=oce->next) {
            if (oce == ce) {
                dump_lists();
                panic("3:ce @ 0x%x is in two lists(cel 0x%x & DIRTY)\n",ce,cel);
            }
        }
    }
    if (prev && prev != cel->mru) {
        dump_lists();
        panic("*** last element in list != cel mru (ce 0x%x, cel 0x%x)\n",
              prev, cel);
    }
}
#endif


#ifdef DEBUG
#if 0
static int
do_dump(int argc, char **argv)
{
    dump_cache_list();
    return 1;
}


static int
do_find_block(int argc, char **argv)
{
    int        i;
    fs_off_t  bnum;
    cache_ent *ce;

    if (argc < 2) {
        kprintf("%s: needs a block # argument\n", argv[0]);
        return 1;
    }

    for(i=1; i < argc; i++) {
        bnum = strtoul(argv[i], NULL, 0);

        for(ce=bc.normal.lru; ce; ce=ce->next) {
            if (ce->block_num == bnum) {
                kprintf("found clean bnum %ld @ 0x%lx (data @ 0x%lx)\n",
                        bnum, ce, ce->data);
            }
        }

        for(ce=bc.locked.lru; ce; ce=ce->next) {
            if (ce->block_num == bnum) {
                kprintf("found locked bnum %ld @ 0x%lx (data @ 0x%lx)\n",
                        bnum, ce, ce->data);
            }
        }
    }

    return 0;
}

static int
do_find_data(int argc, char **argv)
{
    int        i;
    void      *data;
    cache_ent *ce;

    if (argc < 2) {
        kprintf("%s: needs a block # argument\n", argv[0]);
        return 1;
    }

    for(i=1; i < argc; i++) {
        data = (void *)strtoul(argv[i], NULL, 0);

        for(ce=bc.normal.lru; ce; ce=ce->next) {
            if (ce->data == data) {
                kprintf("found normal data ptr for bnum %ld @ ce 0x%lx\n",
                        ce->block_num, ce);
            }
        }

        for(ce=bc.locked.lru; ce; ce=ce->next) {
            if (ce->data == data) {
                kprintf("found locked data ptr for bnum %ld @ ce 0x%lx\n",
                        ce->block_num, ce);
            }
        }
    }

    return 0;
}
#endif
#endif /* DEBUG */



/*
  this function detaches the cache_ent from the list.
*/
static void
delete_from_list(cache_ent_list *cel, cache_ent *ce)
{
    if (ce->next)
        ce->next->prev = ce->prev;
    if (ce->prev)
        ce->prev->next = ce->next;

    if (cel->lru == ce)
        cel->lru = ce->next;
    if (cel->mru == ce)
        cel->mru = ce->prev;

    ce->next = NULL;
    ce->prev = NULL;
}



/*
  this function adds the cache_ent ce to the head of the
  list (i.e. the MRU end).  the cache_ent should *not*
  be in any lists.
*/
static void
add_to_head(cache_ent_list *cel, cache_ent *ce)
{
    if (ce->next != NULL || ce->prev != NULL) {
        panic("*** ath: ce has non-null next/prev ptr (ce 0x%x nxt 0x%x, prv 0x%x)\n",
              ce, ce->next, ce->prev);
    }

    ce->next = NULL;
    ce->prev = cel->mru;

    if (cel->mru)
        cel->mru->next = ce;
    cel->mru = ce;

    if (cel->lru == NULL)
        cel->lru = ce;
}


/*
  this function adds the cache_ent ce to the tail of the
  list (i.e. the LRU end).  the cache_ent should *not*
  be in any lists.
*/
static void
add_to_tail(cache_ent_list *cel, cache_ent *ce)
{
    if (ce->next != NULL || ce->prev != NULL) {
        panic("*** att: ce has non-null next/prev ptr (ce 0x%x nxt 0x%x, prv 0x%x)\n",
              ce, ce->next, ce->prev);
    }

    ce->next = cel->lru;
    ce->prev = NULL;

    if (cel->lru)
        cel->lru->prev = ce;
    cel->lru = ce;

    if (cel->mru == NULL)
        cel->mru = ce;
}
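
/*
   Moving a block to the MRU end of its list is done by unlinking it
   and re-adding it at the head.  A disabled sketch (assumes bc.lock
   is held and ce is on the normal list):
*/
#if 0
    delete_from_list(&bc.normal, ce);   /* ce->next/prev become NULL    */
    add_to_head(&bc.normal, ce);        /* ce is now most recently used */
#endif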


static int
cache_ent_cmp(const void *a, const void *b)
{
    fs_off_t  diff;
    cache_ent *p1 = *(cache_ent **)a, *p2 = *(cache_ent **)b;

    if (p1 == NULL || p2 == NULL)
        panic("cache_ent pointers are null?!? (a %p, b %p)\n", a, b);

    if (p1->dev == p2->dev) {
        diff = p1->block_num - p2->block_num;
        return (int)diff;
    } else {
        return p1->dev - p2->dev;
    }
}
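
/*
   cache_ent_cmp() orders ents by device first and block number second
   so that runs of adjacent blocks line up for a single writev.  It is
   always invoked through qsort(), as in this disabled sketch (note the
   element size is a pointer, so sizeof(cache_ent **) and
   sizeof(cache_ent *) are interchangeable here):
*/
#if 0
    qsort(ents, num_ents, sizeof(cache_ent **), cache_ent_cmp);
#endif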

#if 0
// ToDo: add this to the fsh (as a background thread)?
static void
cache_flusher(void *arg, int phase)
{
    int    i, num_ents, err;
    bigtime_t now = system_time();
    static cache_ent *ce = NULL;
    static cache_ent *ents[NUM_FLUSH_BLOCKS];

    /*
       if someone else was in the cache recently then just bail out so
       we don't lock them out unnecessarily
    */
    if ((now - last_cache_access) < 1000000)
        return;

    LOCK(bc.lock);

    ce = bc.normal.lru;

    for(num_ents=0; ce && num_ents < NUM_FLUSH_BLOCKS; ce=ce->next) {
        if (ce->flags & CE_BUSY)
            continue;

        if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL)
            continue;

        ents[num_ents] = ce;
        ents[num_ents]->flags |= CE_BUSY;
        num_ents++;
    }

    /* if we've got some room left over, look for cloned locked blocks */
    if (num_ents < NUM_FLUSH_BLOCKS) {
        ce = bc.locked.lru;

        for(; num_ents < NUM_FLUSH_BLOCKS;) {
            for(;
                ce && ((ce->flags & CE_BUSY) || ce->clone == NULL);
                ce=ce->next)
                /* skip ents that meet the above criteria */;

            if (ce == NULL)
                break;

            ents[num_ents] = ce;
            ents[num_ents]->flags |= CE_BUSY;
            ce = ce->next;
            num_ents++;
        }
    }

    UNLOCK(bc.lock);

    if (num_ents == 0)
        return;

    qsort(ents, num_ents, sizeof(cache_ent **), cache_ent_cmp);

    if ((err = flush_ents(ents, num_ents)) != 0) {
        printf("flush ents failed (ents @ 0x%lx, num_ents %d!\n",
               (ulong)ents, num_ents);
    }

    for(i=0; i < num_ents; i++) {       /* clear the busy bit on each ent */
        ents[i]->flags &= ~CE_BUSY;
    }
}
#endif


static int
flush_cache_ent(cache_ent *ce)
{
    int   ret = 0;
    void *data;

    /* if true, then there's nothing to flush */
    if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL)
        return 0;

    /* same thing here */
    if (ce->clone == NULL && ce->lock != 0)
        return 0;

 restart:
    if (ce->clone)
        data = ce->clone;
    else
        data = ce->data;

    if (chatty_io > 2)
        printf("flush: %7Ld\n", ce->block_num);
    ret = write_phys_blocks(ce->dev, ce->block_num, data, 1, ce->bsize);

    if (ce->func) {
        ce->func(ce->logged_bnum, 1, ce->arg);
        ce->func = NULL;
    }

    if (ce->clone) {
        free(ce->clone);
        ce->clone = NULL;

        if (ce->lock == 0 && (ce->flags & CE_DIRTY))
            goto restart;     /* also write the real data ptr */
    } else {
        ce->flags &= ~CE_DIRTY;
    }

    return ret;
}


static int
flush_ents(cache_ent **ents, int n_ents)
{
    int    i, j, k, ret = 0, bsize, iocnt, do_again = 0;
    fs_off_t  start_bnum;
    struct iovec *iov;

    iov = get_iovec_array();
    if (iov == NULL)
        return ENOMEM;

restart:
    for(i=0; i < n_ents; i++) {
        /* if true, then there's nothing to flush */
        if ((ents[i]->flags & CE_DIRTY) == 0 && ents[i]->clone == NULL)
            continue;

        /* if true we can't touch the dirty data yet because it's locked */
        if (ents[i]->clone == NULL && ents[i]->lock != 0)
            continue;


        bsize      = ents[i]->bsize;
        start_bnum = ents[i]->block_num;

        for(j=i+1; j < n_ents && (j - i) < NUM_FLUSH_BLOCKS; j++) {
            if (ents[j]->dev != ents[i]->dev ||
                ents[j]->block_num != start_bnum + (j - i))
                break;

            if (ents[j]->clone == NULL && ents[j]->lock != 0)
                break;
        }

        if (j == i+1) {           /* only one block, just flush it directly */
            if ((ret = flush_cache_ent(ents[i])) != 0)
                break;
            continue;
        }


        for(k=i,iocnt=0; k < j; k++,iocnt++) {
            if (ents[k]->clone)
                iov[iocnt].iov_base = ents[k]->clone;
            else
                iov[iocnt].iov_base = ents[k]->data;

            iov[iocnt].iov_len = bsize;
        }

        if (chatty_io)
            printf("writev @ %Ld for %d blocks\n", start_bnum, iocnt);

        ret = writev_pos(ents[i]->dev, start_bnum * (fs_off_t)bsize,
                         &iov[0], iocnt);
        if (ret != iocnt*bsize) {
            int idx;

            printf("flush_ents: writev failed: iocnt %d start bnum %Ld "
                   "bsize %d, ret %d\n", iocnt, start_bnum, bsize, ret);

            for(idx=0; idx < iocnt; idx++)
                printf("iov[%2d] = %p :: %ld\n", idx, iov[idx].iov_base,
                       iov[idx].iov_len);

            printf("error %s writing blocks %Ld:%d (%d != %d)\n",
                   strerror(errno), start_bnum, iocnt, ret, iocnt*bsize);
            ret = EINVAL;
            break;
        }
        ret = 0;


        for(k=i; k < j; k++) {
            if (ents[k]->func) {
                ents[k]->func(ents[k]->logged_bnum, 1, ents[k]->arg);
                ents[k]->func = NULL;
            }

            if (ents[k]->clone) {
                free(ents[k]->clone);
                ents[k]->clone = NULL;
            } else {
                ents[k]->flags &= ~CE_DIRTY;
            }
        }


        i = j - 1;  /* i gets incremented by the outer for loop */
    }

    /*
       here we have to go back through and flush any blocks that are
       still dirty.  with an arched brow you astutely ask, "but how
       could this happen given the above loop?"  Ahhh young grasshopper
       I say, the path through the cache is long and twisty and fraught
       with peril.  The reason it can happen is that a block can be both
       cloned and dirty.  The above loop would only flush the cloned half
       of the data, not the main dirty block.  So we have to go back
       through and see if there are any blocks that are still dirty.  If
       there are we go back to the top of the function and do the whole
       thing over.  Kind of grody but it is necessary to ensure the
       correctness of the log for the Be file system.
    */
    if (do_again == 0) {
        for(i=0; i < n_ents; i++) {
            if ((ents[i]->flags & CE_DIRTY) == 0 || ents[i]->lock)
                continue;

            do_again = 1;
            break;
        }

        if (do_again)
            goto restart;
    }

    release_iovec_array(iov);


    return ret;
}
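
/*
   To make the "cloned and dirty" case above concrete: a block gets a
   clone in set_blocks_info() while a transaction commits, and can then
   be dirtied again by a later transaction.  The first pass writes the
   clone; the do_again pass writes the re-dirtied real data.  Disabled
   sketch of the state that triggers the second pass:
*/
#if 0
    if (ce->clone == NULL && (ce->flags & CE_DIRTY) && ce->lock == 0) {
        /* pass one already wrote and freed ce->clone; ce->data is
           still dirty and unlocked, so pass two flushes it */
    }
#endif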

static void
delete_cache_list(cache_ent_list *cel)
{
    void      *junk;
    cache_ent *ce, *next;

    for (ce = cel->lru; ce; ce = next) {
        next = ce->next;
        if (ce->lock != 0) {
            if (ce->func)
                printf("*** shutdown_block_cache: block %Ld, lock == %d "
                       "(arg %p)!\n", ce->block_num, ce->lock, ce->arg);
            else
                printf("*** shutdown_block_cache: block %Ld, lock == %d!\n",
                       ce->block_num, ce->lock);
        }

        if (ce->flags & CE_BUSY) {
            printf("* shutdown block cache: bnum %Ld is busy? ce %p\n",
                   ce->block_num, ce);
        }

        if ((ce->flags & CE_DIRTY) || ce->clone) {
            flush_cache_ent(ce);
        }

        if (ce->clone)
            free(ce->clone);
        ce->clone = NULL;

        if (ce->data)
            free(ce->data);
        ce->data = NULL;

        if ((junk = hash_delete(&bc.ht, ce->dev, ce->block_num)) != ce) {
            printf("*** free_device_cache: bad hash table entry %Ld "
                   "%p != %p\n", ce->block_num, junk, ce);
        }

        memset(ce, 0xfd, sizeof(*ce));
        free(ce);

        bc.cur_blocks--;
    }
}


void
shutdown_block_cache(void)
{
    /* print_hash_stats(&bc.ht); */

    if (bc.lock.s > 0)
        LOCK(bc.lock);

#ifndef USER
    unregister_kernel_daemon(cache_flusher, NULL);
#endif

    delete_cache_list(&bc.normal);
    delete_cache_list(&bc.locked);

    bc.normal.lru = bc.normal.mru = NULL;
    bc.locked.lru = bc.locked.mru = NULL;

    shutdown_hash_table(&bc.ht);

    if (bc.lock.s > 0)
        free_lock(&bc.lock);
    bc.lock.s = -1;

    if (iovec_lock.s >= 0)
        free_lock(&iovec_lock);
}



int
init_cache_for_device(int fd, fs_off_t max_blocks)
{
    int ret = 0;

    if (fd >= MAX_DEVICES)
        return -1;

    LOCK(bc.lock);

    if (max_device_blocks[fd] != 0) {
        printf("device %d is already initialized!\n", fd);
        ret = -1;
    } else {
        max_device_blocks[fd] = max_blocks;
    }

    UNLOCK(bc.lock);

    return ret;
}
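
/*
   Disabled sketch of the expected startup sequence for a volume.  The
   fd comes from whoever opened the device; the block counts here are
   made-up values:
*/
#if 0
static int
example_cache_setup(int fd)
{
    if (init_block_cache(2048, 0) != 0)           /* cache up to 2048 blocks */
        return ENOMEM;

    if (init_cache_for_device(fd, 1000000) != 0)  /* device holds 1e6 blocks */
        return EINVAL;

    return 0;
}
#endif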



/*
   this routine assumes that bc.lock has been acquired
*/
static cache_ent *
block_lookup(int dev, fs_off_t bnum)
{
    int        count = 0;
    cache_ent *ce;

    while (1) {
        ce = hash_lookup(&bc.ht, dev, bnum);
        if (ce == NULL)
            return NULL;

        if ((ce->flags & CE_BUSY) == 0) /* it's ok, break out and return it */
            break;

        /* else, it's busy and we need to retry our lookup */
        UNLOCK(bc.lock);

        snooze(5000);
        if (count++ == 5000) {  /* then a lot of time has elapsed */
            printf("block %Ld isn't coming un-busy (ce @ %p)\n",
                   ce->block_num, ce);
        }

        LOCK(bc.lock);
    }

    if (ce->flags & CE_BUSY)
        panic("block lookup: returning a busy block @ 0x%lx?!?\n",(ulong)ce);

    return ce;
}



int
set_blocks_info(int dev, fs_off_t *blocks, int nblocks,
               void (*func)(fs_off_t bnum, size_t nblocks, void *arg), void *arg)
{
    int        i, j, cur;
    cache_ent *ce;
    cache_ent *ents[NUM_FLUSH_BLOCKS];

    LOCK(bc.lock);


    for(i=0, cur=0; i < nblocks; i++) {

        /* printf("sbi:   %ld (arg 0x%x)\n", blocks[i], arg); */
        ce = block_lookup(dev, blocks[i]);
        if (ce == NULL) {
            panic("*** set_block_info can't find bnum %Ld!\n", blocks[i]);
            UNLOCK(bc.lock);
            return ENOENT;   /* hopefully this doesn't happen... */
        }


        if (blocks[i] != ce->block_num || dev != ce->dev) {
            UNLOCK(bc.lock);
            panic("** error1: looked up dev %d block %Ld but found dev %d "
                    "bnum %Ld\n", dev, blocks[i], ce->dev, ce->block_num);
            return EBADF;
        }

        if (ce->lock == 0) {
            panic("* set_block_info on bnum %Ld (%d) but it's not locked!\n",
                    blocks[i], nblocks);
        }


        if ((ce->flags & CE_DIRTY) == 0) {
            panic("*** set_block_info on non-dirty block bnum %Ld (%d)!\n",
                    blocks[i], nblocks);
        }

        ce->flags |= CE_BUSY;     /* mark all blocks as busy till we're done */

        /* if there is cloned data, it needs to be flushed now */
        if (ce->clone && ce->func) {
            ents[cur++] = ce;

            if (cur >= NUM_FLUSH_BLOCKS) {
                UNLOCK(bc.lock);

                qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);

                flush_ents(ents, cur);

                LOCK(bc.lock);
                for(j=0; j < cur; j++)
                    ents[j]->flags &= ~CE_BUSY;
                cur = 0;
            }
        }
    }


    if (cur != 0) {
        UNLOCK(bc.lock);

        qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);

        flush_ents(ents, cur);

        LOCK(bc.lock);
        for(j=0; j < cur; j++)
            ents[j]->flags &= ~CE_BUSY;
        cur = 0;
    }


    /* now go through and set the info that we were asked to */
    for (i = 0; i < nblocks; i++) {
        /* we can call hash_lookup() here because we know it's around */
        ce = hash_lookup(&bc.ht, dev, blocks[i]);
        if (ce == NULL) {
            panic("*** set_block_info can't find bnum %Ld!\n", blocks[i]);
            UNLOCK(bc.lock);
            return ENOENT;   /* hopefully this doesn't happen... */
        }

        ce->flags &= ~(CE_DIRTY | CE_BUSY);

        if (ce->func != NULL) {
            panic("*** set_block_info non-null callback on bnum %Ld\n",
                    ce->block_num);
        }

        if (ce->clone != NULL) {
            panic("*** ce->clone == %p, not NULL in set_block_info\n",
                    ce->clone);
        }

        ce->clone = (void *)malloc(ce->bsize);
        if (ce->clone == NULL)
            panic("*** can't clone bnum %Ld (bsize %d)\n",
                    ce->block_num, ce->bsize);


        memcpy(ce->clone, ce->data, ce->bsize);

        ce->func   = func;
        ce->arg    = arg;

        ce->logged_bnum = blocks[i];

        ce->lock--;
        if (ce->lock < 0) {
            printf("sbi: whoa nellie! ce @ %p (%Ld) has lock == %d\n",
                   ce, ce->block_num, ce->lock);
        }

        if (ce->lock == 0) {
            delete_from_list(&bc.locked, ce);
            add_to_head(&bc.normal, ce);
        }
    }

    UNLOCK(bc.lock);

    return 0;
}
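
/*
   Disabled sketch of how a journal might use set_blocks_info(): the
   callback fires once the cloned data has been written to its home
   location, which is when the corresponding log space can be
   reclaimed.  Both functions below are hypothetical:
*/
#if 0
static void
example_blocks_flushed(fs_off_t bnum, size_t nblocks, void *arg)
{
    /* e.g. mark the log entry that covered bnum as complete */
}

static void
example_end_transaction(int dev, fs_off_t *blocks, int nblocks, void *log_handle)
{
    /* every block in blocks[] must be locked and dirty, per the checks above */
    set_blocks_info(dev, blocks, nblocks, example_blocks_flushed, log_handle);
}
#endif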


/* this function is only for use by flush_device() */
static void
do_flush(cache_ent **ents, int max)
{
    int i;

    for(i=0; i < max; i++) {
        ents[i]->flags |= CE_BUSY;
    }

    UNLOCK(bc.lock);

    qsort(ents, max, sizeof(cache_ent **), cache_ent_cmp);
    flush_ents(ents, max);

    LOCK(bc.lock);
    for(i=0; i < max; i++) {
        ents[i]->flags &= ~CE_BUSY;
    }
}

int
flush_device(int dev, int warn_locked)
{
    int cur;
    cache_ent *ce;
    cache_ent *ents[NUM_FLUSH_BLOCKS];

    LOCK(bc.lock);

    cur = 0;
    ce = bc.normal.lru;
    while (ce) {
        if (ce->dev != dev || (ce->flags & CE_BUSY)) {
            ce = ce->next;
            continue;
        }

        if ((ce->flags & CE_DIRTY) || ce->clone) {
            ents[cur++] = ce;
            if (cur >= NUM_FLUSH_BLOCKS) {
                do_flush(ents, cur);

                ce = bc.normal.lru;
                cur = 0;
                continue;
            }
        }

        ce = ce->next;
    }

    if (cur != 0)
        do_flush(ents, cur);

    cur = 0;
    ce = bc.locked.lru;
    while (ce) {
        if (ce->dev != dev || (ce->flags & CE_BUSY)) {
            ce = ce->next;
            continue;
        }

        if (ce->clone) {
            ents[cur++] = ce;
            if (cur >= NUM_FLUSH_BLOCKS) {
                do_flush(ents, cur);

                ce = bc.locked.lru;
                cur = 0;
                continue;
            }
        }

        ce = ce->next;
    }

    if (cur != 0)
        do_flush(ents, cur);

    UNLOCK(bc.lock);

    return 0;
}


static void
real_remove_cached_blocks(int dev, int allow_writes, cache_ent_list *cel)
{
    void      *junk;
    cache_ent *ce, *next = NULL;

    for(ce=cel->lru; ce; ce=next) {
        next = ce->next;

        if (ce->dev != dev) {
            continue;
        }

        if (ce->lock != 0 || (ce->flags & CE_BUSY)) {
            printf("*** remove_cached_dev: block %Ld has lock = %d, flags "
                   "0x%x! ce @ 0x%lx\n", ce->block_num, ce->lock, ce->flags,
                   (ulong)ce);
        }

        if (allow_writes == ALLOW_WRITES &&
            ((ce->flags & CE_DIRTY) || ce->clone)) {
            ce->flags |= CE_BUSY;
            flush_cache_ent(ce);
            ce->flags &= ~CE_BUSY;
        }

        /* unlink this guy */
        if (cel->lru == ce)
            cel->lru = ce->next;

        if (cel->mru == ce)
            cel->mru = ce->prev;

        if (ce->prev)
            ce->prev->next = ce->next;
        if (ce->next)
            ce->next->prev = ce->prev;

        if (ce->clone)
            free(ce->clone);
        ce->clone = NULL;

        if (ce->data)
            free(ce->data);
        ce->data = NULL;

        if ((junk = hash_delete(&bc.ht, ce->dev, ce->block_num)) != ce) {
            panic("*** remove_cached_device: bad hash table entry %Ld "
                   "0x%lx != 0x%lx\n", ce->block_num, (ulong)junk, (ulong)ce);
        }

        free(ce);

        bc.cur_blocks--;
    }

}

int
remove_cached_device_blocks(int dev, int allow_writes)
{
    LOCK(bc.lock);

    real_remove_cached_blocks(dev, allow_writes, &bc.normal);
    real_remove_cached_blocks(dev, allow_writes, &bc.locked);

    max_device_blocks[dev] = 0;

    UNLOCK(bc.lock);

    return 0;
}



int
flush_blocks(int dev, fs_off_t bnum, int nblocks)
{
    int        cur, i;
    cache_ent *ce;
    cache_ent *ents[NUM_FLUSH_BLOCKS];

    if (nblocks == 0)   /* might as well check for this */
        return 0;

    LOCK(bc.lock);

    cur = 0;
    for(; nblocks > 0; nblocks--, bnum++) {
        ce = block_lookup(dev, bnum);
        if (ce == NULL)
            continue;

        if (bnum != ce->block_num || dev != ce->dev) {
            UNLOCK(bc.lock);
            panic("error2: looked up dev %d block %Ld but found %d %Ld\n",
                  dev, bnum, ce->dev, ce->block_num);
            return EBADF;
        }

        if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL)
            continue;

        ce->flags |= CE_BUSY;
        ents[cur++] = ce;

        if (cur >= NUM_FLUSH_BLOCKS) {
            UNLOCK(bc.lock);

            qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);
            flush_ents(ents, cur);

            LOCK(bc.lock);
            for(i=0; i < cur; i++) {
                ents[i]->flags &= ~CE_BUSY;
            }
            cur = 0;
        }
    }

    UNLOCK(bc.lock);

    if (cur == 0)     /* nothing more to do */
        return 0;

    /* flush out the last few buggers */
    qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);
    flush_ents(ents, cur);

    for(i=0; i < cur; i++) {
        ents[i]->flags &= ~CE_BUSY;
    }

    return 0;
}


int
mark_blocks_dirty(int dev, fs_off_t bnum, int nblocks)
{
    int        ret = 0;
    cache_ent *ce;

    LOCK(bc.lock);

    while(nblocks > 0) {
        ce = block_lookup(dev, bnum);
        if (ce) {
            ce->flags |= CE_DIRTY;
            bnum      += 1;
            nblocks   -= 1;
        } else {     /* hmmm, that's odd, didn't find it */
            printf("** mark_blocks_dirty couldn't find block %Ld (len %d)\n",
                   bnum, nblocks);
            ret = ENOENT;
            break;
        }
    }

    UNLOCK(bc.lock);

    return ret;
}



int
release_block(int dev, fs_off_t bnum)
{
    cache_ent *ce;

    /* printf("rlsb: %ld\n", bnum); */
    LOCK(bc.lock);

    ce = block_lookup(dev, bnum);
    if (ce) {
        if (bnum != ce->block_num || dev != ce->dev) {
            panic("*** error3: looked up dev %d block %Ld but found %d %Ld\n",
                    dev, bnum, ce->dev, ce->block_num);
            UNLOCK(bc.lock);
            return EBADF;
        }

        ce->lock--;

        if (ce->lock < 0) {
            printf("rlsb: whoa nellie! ce %Ld has lock == %d\n",
                   ce->block_num, ce->lock);
        }

        if (ce->lock == 0) {
            delete_from_list(&bc.locked, ce);
            add_to_head(&bc.normal, ce);
        }

    } else {     /* hmmm, that's odd, didn't find it */
        panic("** release_block asked to find %Ld but it's not here\n",
               bnum);
    }

    UNLOCK(bc.lock);

    return 0;
}


static cache_ent *
new_cache_ent(int bsize)
{
    cache_ent *ce;

    ce = (cache_ent *)calloc(1, sizeof(cache_ent));
    if (ce == NULL) {
        panic("*** error: cache can't allocate memory!\n");
        return NULL;
    }

    ce->data = malloc(bsize);
    if (ce->data == NULL) {
        free(ce);
        panic("** error cache can't allocate data memory\n");
        UNLOCK(bc.lock);
        return NULL;
    }

    ce->dev       = -1;
    ce->block_num = -1;

    return ce;
}


static void
get_ents(cache_ent **ents, int num_needed, int max, int *num_gotten, int bsize)
{
    int        cur, retry_counter = 0, max_retry = num_needed * 256;
    cache_ent *ce;

    if (num_needed > max)
        panic("get_ents: num_needed %d but max %d (doh!)\n", num_needed, max);

    /* if the cache isn't full yet, just allocate the blocks */
    for(cur=0; bc.cur_blocks < bc.max_blocks && cur < num_needed; cur++) {
        ents[cur] = new_cache_ent(bsize);
        if (ents[cur] == NULL)
            break;
        bc.cur_blocks++;
    }

    /* pluck off blocks from the LRU end of the normal list, keep trying too */
    while(cur < num_needed && retry_counter < max_retry) {
        for(ce=bc.normal.lru; ce && cur < num_needed; ce=ce->next) {
            if (ce->lock)
                panic("get_ents: normal list has locked blocks (ce 0x%x)\n",ce);

            if (ce->flags & CE_BUSY)   /* don't touch busy blocks */
                continue;

            ce->flags   |= CE_BUSY;
            ents[cur++]  = ce;
        }

        if (cur < num_needed) {
            UNLOCK(bc.lock);
            snooze(10000);
            LOCK(bc.lock);
            retry_counter++;
        }
    }

    if (cur < num_needed && retry_counter >= max_retry) {  /* oh shit! */
        dump_cache_list();
        UNLOCK(bc.lock);
        panic("get_ents: waited too long; can't get enough ce's (c %d n %d)\n",
              cur, num_needed);
    }

    /*
      If the last block is a dirty one, try to get more of 'em so
      that we can flush a bunch of blocks at once.
    */
    if (cur && cur < max &&
        ((ents[cur-1]->flags & CE_DIRTY) || ents[cur-1]->clone)) {

        for(ce=ents[cur-1]->next; ce && cur < max; ce=ce->next) {
            if (ce->flags & CE_BUSY)   /* don't touch busy blocks */
                continue;

            if (ce->lock)
                panic("get_ents:2 dirty list has locked blocks (ce 0x%x)\n",ce);

            ce->flags   |= CE_BUSY;
            ents[cur++]  = ce;
        }
    }

    *num_gotten = cur;
}


static int
read_into_ents(int dev, fs_off_t bnum, cache_ent **ents, int num, int bsize)
{
    int    i, ret;
    struct iovec *iov;

    iov = get_iovec_array();

    for (i = 0; i < num; i++) {
        iov[i].iov_base = ents[i]->data;
        iov[i].iov_len  = bsize;
    }

    if (chatty_io > 2)
        printf("readv @ %Ld for %d blocks (at %Ld, block_size = %d)\n",
               bnum, num, bnum*bsize, bsize);
    ret = readv_pos(dev, bnum*bsize, iov, num);

    release_iovec_array(iov);

    if (ret != num*bsize) {
        printf("read_into_ents: asked to read %d bytes but got %d\n",
               num*bsize, ret);
        printf("*** iov @ %p (num %d)\n", iov, num);
        return EINVAL;
    } else
        return 0;
}



#define CACHE_READ          0x0001
#define CACHE_WRITE         0x0002
#define CACHE_NOOP          0x0004     /* for getting empty blocks */
#define CACHE_LOCKED        0x0008
#define CACHE_READ_AHEAD_OK 0x0010     /* it's ok to do read-ahead */


static char *
op_to_str(int op)
{
    static char buff[128];

    if (op & CACHE_READ)
        strcpy(buff, "READ");
    else if (op & CACHE_WRITE)
        strcpy(buff, "WRITE");
    else if (op & CACHE_NOOP)
        strcpy(buff, "NOP");

    if (op & CACHE_LOCKED)
        strcat(buff, " LOCKED");

    if (op & CACHE_READ_AHEAD_OK)
        strcat(buff, " (AHEAD)");

    return buff;
}
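
/*
   The op argument to cache_block_io() below is a bitwise OR of the
   CACHE_* flags above.  Disabled sketch of two representative
   combinations ("dev", "bnum", "bsize", "buff", "nblocks" and "ptr"
   are hypothetical):
*/
#if 0
    /* read one block and pin it in the cache (the get_block() style) */
    cache_block_io(dev, bnum, NULL, 1, bsize,
                   CACHE_READ | CACHE_LOCKED, &ptr);

    /* plain buffered read of several blocks where read-ahead is allowed */
    cache_block_io(dev, bnum, buff, nblocks, bsize,
                   CACHE_READ | CACHE_READ_AHEAD_OK, NULL);
#endif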

static int
cache_block_io(int dev, fs_off_t bnum, void *data, fs_off_t num_blocks, int bsize,
               int op, void **dataptr)
{
    size_t          err = 0;
    cache_ent      *ce;
    cache_ent_list *cel;

    if (chatty_io > 1)
        printf("cbio: bnum = %Ld, num_blocks = %Ld, bsize = %d, op = %s\n", bnum, num_blocks,
               bsize, op_to_str(op));

    /* some sanity checks first */
    if (bsize == 0)
        panic("cache_io: block size == 0 for bnum %Ld?!?\n", bnum);

    if (num_blocks == 0)
        panic("cache_io: bnum %Ld has num_blocks == 0!\n", bnum);

    if (data == NULL && dataptr == NULL) {
        printf("major butthead move: null data and dataptr! bnum %Ld:%Ld\n",
                bnum, num_blocks);
        return ENOMEM;
    }

    if (data == NULL) {
        if (num_blocks != 1)    /* get_block() should never do that */
            panic("cache_io: num_blocks %Ld but should be 1\n",
                  num_blocks);

        if (op & CACHE_WRITE)
            panic("cache_io: get_block() asked to write?!?\n");
    }

    if (bnum + num_blocks > max_device_blocks[dev]) {
        printf("dev %d: access to blocks %Ld:%Ld but max_dev_blocks is %Ld\n",
               dev, bnum, num_blocks, max_device_blocks[dev]);

        // let the app crash here
        *(int *)0x3100 = 0xc0debabe;
        return EINVAL;
    }

    last_cache_access = system_time();

    /* if the i/o is greater than 64k, do it directly */
    if (num_blocks * bsize >= 64 * 1024) {
        char  *ptr;
        fs_off_t  tmp;

        if (data == NULL || (op & CACHE_LOCKED)) {
            panic("*** asked to do a large locked io that's too hard!\n");
        }


        if (op & CACHE_READ) {
            if (read_phys_blocks(dev, bnum, data, num_blocks, bsize) != 0) {
                printf("cache read:read_phys_blocks failed (%s on blocks %Ld:%Ld)!\n",
                        strerror(errno), bnum, num_blocks);
                return EINVAL;
            }

            LOCK(bc.lock);

            /* if any of the blocks are in the cache, grab them instead */
            ptr = data;
            for(tmp=bnum; tmp < bnum+num_blocks; tmp++, ptr+=bsize) {
                ce = block_lookup(dev, tmp);
                /*
                    if we find a block in the cache we have to copy its
                    data just in case it is more recent than what we just
                    read from disk (which could happen if someone wrote
                    these blocks after we did the read but before we locked
                    the cache and entered this loop).
                */
                if (ce) {
                    if (tmp != ce->block_num || dev != ce->dev) {
                        UNLOCK(bc.lock);
                        panic("*** error4: looked up dev %d block %Ld but "
                                "found %d %Ld\n", dev, tmp, ce->dev,
                                ce->block_num);
                    }

                    memcpy(ptr, ce->data, bsize);
                }
            }

            UNLOCK(bc.lock);
        } else if (op & CACHE_WRITE) {
            LOCK(bc.lock);

            /* if any of the blocks are in the cache, update them too */
            ptr = data;
            for(tmp=bnum; tmp < bnum+num_blocks; tmp++, ptr+=bsize) {
                ce = block_lookup(dev, tmp);
                if (ce) {
                    if (tmp != ce->block_num || dev != ce->dev) {
                        UNLOCK(bc.lock);
                        panic("*** error5: looked up dev %d block %Ld but "
                                "found %d %Ld\n", dev, tmp, ce->dev,
                                ce->block_num);
                        return EBADF;
                    }

                    /* XXXdbg -- this isn't strictly necessary */
                    if (ce->clone) {
                        printf("over-writing cloned data (ce %p bnum %Ld)...\n", ce, tmp);
                        flush_cache_ent(ce);
                    }

                    /* copy the data into the cache */
                    memcpy(ce->data, ptr, bsize);
                }
            }

            UNLOCK(bc.lock);

            if (write_phys_blocks(dev, bnum, data, num_blocks, bsize) != 0) {
                printf("cache write: write_phys_blocks failed (%s on blocks "
                       "%Ld:%Ld)!\n", strerror(errno), bnum, num_blocks);
                return EINVAL;
            }
        } else {
            printf("bad cache op %d (bnum %Ld nblocks %Ld)\n", op, bnum, num_blocks);
            return EINVAL;
        }

        return 0;
    }


    LOCK(bc.lock);
    while(num_blocks) {

        ce = block_lookup(dev, bnum);
        if (ce) {
            if (bnum != ce->block_num || dev != ce->dev) {
                UNLOCK(bc.lock);
                panic("*** error6: looked up dev %d block %Ld but found "
                        "%d %Ld\n", dev, bnum, ce->dev, ce->block_num);
                return EBADF;
            }

            if (bsize != ce->bsize) {
                panic("*** requested bsize %d but ce->bsize %d ce @ 0x%x\n",
                        bsize, ce->bsize, ce);
            }

            /* delete this ent from the list it is in because it may change */
            if (ce->lock)
                cel = &bc.locked;
            else
                cel = &bc.normal;

            delete_from_list(cel, ce);

            if (op & CACHE_READ) {
                if (data && data != ce->data) {
                    memcpy(data, ce->data, bsize);
                } else if (dataptr) {
                    *dataptr = ce->data;
                } else {
                    printf("cbio:data %p dptr %p ce @ %p ce->data %p\n",
                           data, dataptr, ce, ce->data);
                }
            } else if (op & CACHE_WRITE) {
                if (data && data != ce->data)
                    memcpy(ce->data, data, bsize);

                ce->flags |= CE_DIRTY;
            } else if (op & CACHE_NOOP) {
                memset(ce->data, 0, bsize);
                if (data)
                    memset(data, 0, bsize);

                if (dataptr)
                    *dataptr = ce->data;

                ce->flags |= CE_DIRTY;
            } else {
                panic("cached_block_io: bogus op %d\n", op);
            }

            if (op & CACHE_LOCKED)
                ce->lock++;

            if (ce->lock)
                cel = &bc.locked;
            else
                cel = &bc.normal;

            /* now put this ent at the head of the appropriate list */
            add_to_head(cel, ce);

            if (data != NULL)
                data = (void *)((char *)data + bsize);

            bnum       += 1;
            num_blocks -= 1;

            continue;
1937        } else {                                  /* it's not in the cache */
1938            int        cur, cur_nblocks, num_dirty, real_nblocks, num_needed;
1939            cache_ent *ents[NUM_FLUSH_BLOCKS];
1940
1941            /*
1942               here we find out how many additional blocks in this request
1943               are not in the cache.  the idea is that then we can do one
1944               big i/o on that many blocks at once.
1945            */
1946            for(cur_nblocks=1;
1947                cur_nblocks < num_blocks && cur_nblocks < NUM_FLUSH_BLOCKS;
1948                cur_nblocks++) {
1949
1950                /* we can call hash_lookup() directly instead of
1951                   block_lookup() because we don't care about the
1952                   state of the busy bit of the block at this point
1953                */
1954                if (hash_lookup(&bc.ht, dev, bnum + cur_nblocks))
1955                    break;
1956            }

            /*
              here we try to figure out how many extra blocks we should read
              for read-ahead.  we want to read as many as possible that are
              not already in the cache and that don't cause us to try and
              read beyond the end of the disk.
            */
            if ((op & CACHE_READ) && (op & CACHE_READ_AHEAD_OK) &&
                (cur_nblocks * bsize) < read_ahead_size) {

                for(num_needed=cur_nblocks;
                    num_needed < (read_ahead_size / bsize);
                    num_needed++) {

                    if ((bnum + num_needed) >= max_device_blocks[dev])
                        break;

                    if (hash_lookup(&bc.ht, dev, bnum + num_needed))
                        break;
                }
            } else {
                num_needed = cur_nblocks;
            }
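
            /*
               e.g. with a bsize of 1024 and a read_ahead_size of 32k,
               num_needed can grow to as many as 32 blocks
               (read_ahead_size / bsize), stopping early if the next
               block is past the end of the device or already cached.
            */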

            /* this will get us pointers to a bunch of cache_ents we can use */
            get_ents(ents, num_needed, NUM_FLUSH_BLOCKS, &real_nblocks, bsize);

            if (real_nblocks < num_needed) {
                panic("don't have enough cache ents (need %d got %d %Ld::%Ld)\n",
                      num_needed, real_nblocks, bnum, num_blocks);
            }

            /*
              There are now three variables used as limits within the ents
              array.  This is how they are related:

                 cur_nblocks <= num_needed <= real_nblocks

              Ents from 0 to cur_nblocks-1 are going to be used to fulfill
              this IO request.  Ents from cur_nblocks to num_needed-1 are
              for read-ahead.  Ents from num_needed to real_nblocks-1 are
              extra blocks that get_ents() asked us to flush.  Often (and
              always on writes) cur_nblocks == num_needed.

              Below, we sort the list of ents so that when we flush them
              they go out in order.
            */
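
            /*
              e.g. a two-block read that pulls in two read-ahead blocks,
              where get_ents() also hands back one extra dirty victim to
              flush, ends up with cur_nblocks == 2, num_needed == 4 and
              real_nblocks == 5; ents[4] is flushed and put back on the
              normal list but not used for this request.
            */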

            qsort(ents, real_nblocks, sizeof(cache_ent *), cache_ent_cmp);

            /*
              delete each ent from its list because it will change.  also
              count up how many dirty blocks there are and insert into the
              hash table any new blocks so that no one else will try to
              read them in when we release the cache semaphore to do our I/O.
            */
            for(cur=0,num_dirty=0; cur < real_nblocks; cur++) {
                ce = ents[cur];
                ce->flags |= CE_BUSY;

                /*
                   insert the new block into the hash table with its new block
                   number. note that the block is still in the hash table for
                   its old block number -- and it has to be until we are done
                   flushing it from the cache (to prevent someone else from
                   sneaking in in front of us and trying to read the same
                   block that we're flushing).
                */
                if (cur < num_needed) {
                    if (hash_insert(&bc.ht, dev, bnum + cur, ce) != 0)
                        panic("could not insert cache ent for %d %Ld (0x%lx)\n",
                              dev, bnum + cur, (ulong)ents[cur]);
                }

                if (ce->dev == -1)
                    continue;

                if ((ce->flags & CE_DIRTY) || ce->clone)
                    num_dirty++;

                if (ce->lock)
                    panic("cbio: can't use locked blocks here ce @ %p\n", ce);
                else
                    cel = &bc.normal;

                delete_from_list(cel, ce);
            }
            ce = NULL;


            /*
               we release the block cache semaphore here so that we can
               go do all the i/o we need to do (flushing dirty blocks
               that we're kicking out as well as reading any new data).

               because all the blocks we're touching are marked busy
               no one else should mess with them while we're doing this.
            */
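
            /*
               (anyone coming through block_lookup() will notice that
               CE_BUSY is set on these blocks and will wait for it to
               clear before using them.)
            */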
            if (num_dirty || (op & CACHE_READ)) {
                UNLOCK(bc.lock);

                /* this flushes any blocks we're kicking out that are dirty */
                if (num_dirty && (err = flush_ents(ents, real_nblocks)) != 0) {
                    printf("flush ents failed (ents @ %p, nblocks %d)!\n",
                           ents, real_nblocks);
                    goto handle_err;
                }

            }

            /*
               now that everything is flushed to disk, go through and
               make sure that the data blocks we're going to use are
               the right block size for this current request (it's
               possible we're kicking out some smaller blocks and need
               to reallocate the data block pointer).  We do this in two
               steps, first freeing everything and then going through
               and doing the mallocs to try and be nice to the memory
               system (i.e. allow it to coalesce stuff, etc).
            */
            err = 0;
            for(cur=0; cur < num_needed; cur++) {
                if (ents[cur]->bsize != bsize) {
                    free(ents[cur]->data);
                    ents[cur]->data = NULL;

                    if (ents[cur]->clone) {
                        free(ents[cur]->clone);
                        ents[cur]->clone = NULL;
                    }
                }
            }

            for(cur=0; cur < num_needed; cur++) {
                if (ents[cur]->data == NULL) {
                    ents[cur]->data  = (void *)malloc(bsize);
                    ents[cur]->bsize = bsize;
                }

                if (ents[cur]->data == NULL) {
                    printf("cache: no memory for block (bsize %d)!\n",
                           bsize);
                    err = ENOMEM;
                    break;
                }
            }

            /*
               if this condition is true it's a pretty serious error.
               we'll try and back out gracefully but we're in pretty
               deep at this point and it ain't going to be easy.
            */
  handle_err:
            if (err) {
                /* if the cache lock was not released above (nothing was
                   dirty and this is not a read), release it now so this
                   error path always runs unlocked, the same as when we
                   arrive here via a flush_ents() failure */
                if (num_dirty == 0 && (op & CACHE_READ) == 0)
                    UNLOCK(bc.lock);

                for(cur=0; cur < num_needed; cur++) {
                    cache_ent *tmp_ce;

                    tmp_ce = (cache_ent *)hash_delete(&bc.ht,dev,bnum+cur);
                    if (tmp_ce != ents[cur]) {
                        panic("hash_del0: %d %Ld got 0x%lx, not 0x%lx\n",
                                dev, bnum+cur, (ulong)tmp_ce,
                                (ulong)ents[cur]);
                    }

                    /* fresh ents (dev == -1) were never hashed under an
                       old block number, so only recycled ents have an
                       old entry to delete */
                    if (ents[cur]->dev != -1) {
                        tmp_ce = (cache_ent *)hash_delete(&bc.ht,
                                     ents[cur]->dev, ents[cur]->block_num);
                        if (tmp_ce != ents[cur]) {
                            panic("hash_del1: %d %Ld got 0x%lx, not 0x%lx\n",
                                  ents[cur]->dev, ents[cur]->block_num,
                                  (ulong)tmp_ce, (ulong)ents[cur]);
                        }
                    }

                    ents[cur]->flags &= ~CE_BUSY;
                    if (ents[cur]->data)
                        free(ents[cur]->data);
                    free(ents[cur]);
                    ents[cur] = NULL;

                    bc.cur_blocks--;
                }

                if (cur < real_nblocks) {
                    LOCK(bc.lock);
                    for(; cur < real_nblocks; cur++) {
                        ents[cur]->flags &= ~CE_BUSY;

                        /* we have to put them back here */
                        add_to_tail(&bc.normal, ents[cur]);
                    }
                    UNLOCK(bc.lock);
                }

                return err;   /* ENOMEM, or whatever flush_ents() hit */
            }


            /*
               If we go into this if statement, the block cache lock
               has *already been released* up above when we flushed the
               dirty entries.  As always, since the blocks we're mucking
               with are marked busy, they shouldn't get messed with.
            */
            err = 0;
            if (num_dirty || (op & CACHE_READ)) {
                /* this section performs the i/o that we need to do */
                if (op & CACHE_READ) {
                    err = read_into_ents(dev, bnum, ents, num_needed, bsize);
                } else {
                    err = 0;
                }

                if (err != 0) {
                    printf("err %s on dev %d block %Ld:%d (%d) "
                           "data %p, ents[0] %p\n",
                           strerror(err), dev, bnum, cur_nblocks,
                           bsize, data, ents[0]);
                }

                /*
                   acquire the semaphore here so that we can go on mucking
                   with the cache data structures.  We need to delete old
                   block numbers from the hash table and set the new block
                   numbers for the blocks we just read in.  We also put the
                   read-ahead blocks at the head of the mru list.
                */

                LOCK(bc.lock);
            }

            for(cur=0; cur < num_needed; cur++) {
                cache_ent *tmp_ce;

                ce = ents[cur];
                if (ce->dev != -1) {
                    tmp_ce = hash_delete(&bc.ht, ce->dev, ce->block_num);
                    if (tmp_ce == NULL || tmp_ce != ce) {
                        panic("*** hash_delete failure (ce %p tce %p)\n",
                              ce, tmp_ce);
                    }
                }

                if (err == 0 && cur >= cur_nblocks) {
                    ce->dev       = dev;
                    ce->block_num = bnum + cur;
                    ce->flags    &= ~CE_BUSY;
                    add_to_head(&bc.normal, ce);
                }
            }
            ce = NULL;

            /*
              clear the busy bit on the blocks we force-flushed and
              put them on the normal list since they're now clean.
              they go on the tail (the lru end) since they were picked
              as eviction victims in the first place.
            */
            for(; cur < real_nblocks; cur++) {
                ents[cur]->flags &= ~CE_BUSY;

                if (ents[cur]->lock)
                    panic("should not have locked blocks here (ce %p)\n",
                          ents[cur]);

                add_to_tail(&bc.normal, ents[cur]);
            }

            if (err) {   /* then we have some cleanup to do */
                for(cur=0; cur < num_needed; cur++) {
                    cache_ent *tmp_ce;

                    /* we delete all these blocks from the cache so we
                       don't leave partially read (bogus) blocks in the
                       cache */

                    tmp_ce = (cache_ent *)hash_delete(&bc.ht,dev,bnum+cur);
                    if (tmp_ce != ents[cur]) {
                        panic("hash_del: %d %Ld got 0x%lx, not 0x%lx\n",
                                dev, bnum+cur, (ulong)tmp_ce,
                                (ulong)ents[cur]);
                    }

                    ce = ents[cur];
                    ce->flags &= ~CE_BUSY;

                    free(ce->data);
                    ce->data = NULL;

                    free(ce);
                    ents[cur] = NULL;

                    bc.cur_blocks--;
                }
                ce = NULL;

                UNLOCK(bc.lock);
                return err;
            }


            /*
               last step: go through and make sure all the cache_ent
               structures have the right data in them, delete old guys, etc.
            */
            for(cur=0; cur < cur_nblocks; cur++) {
                ce = ents[cur];

                if (ce->dev != -1) {   /* then clean this guy up */
                    if (ce->next || ce->prev)
                        panic("ce @ %p should not be in a list yet!\n", ce);

                    if (ce->clone)
                        free(ce->clone);

                    if (ce->data == NULL)
                        panic("ce @ 0x%lx has a null data ptr\n", (ulong)ce);
                }

                ce->dev        = dev;
                ce->block_num  = bnum + cur;
                ce->bsize      = bsize;
                ce->flags      = CE_NORMAL;
                ce->lock       = 0;
                ce->clone      = NULL;
                ce->func = ce->arg  = NULL;
                ce->next = ce->prev = NULL;

                if (op & CACHE_READ) {
                    if (data)
                        memcpy(data, ce->data, bsize);
                } else if (op & CACHE_WRITE) {
                    ce->flags  |= CE_DIRTY;
                    memcpy(ce->data, data, bsize);
                } else if (op & CACHE_NOOP) {
                    memset(ce->data, 0, bsize);
                    if (data)
                        memset(data, 0, bsize);

                    ce->flags |= CE_DIRTY;
                }

                if (op & CACHE_LOCKED) {
                    ce->lock++;
                    cel = &bc.locked;
                } else {
                    cel = &bc.normal;
                }

                /* now stick this puppy at the head of the mru list */
                add_to_head(cel, ce);


                if (dataptr) {
                    *dataptr = ce->data;
                }

                if (data != NULL)
                    data = (void *)((char *)data + bsize);
                else if (cur_nblocks != 1)
                    panic("cache can't handle setting data_ptr twice!\n");
            }  /* end of for(cur=0; cur < cur_nblocks; cur++) */

            bnum       += cur_nblocks;
            num_blocks -= cur_nblocks;

        }   /* end of else it's not in the cache */

    }   /* end of while(num_blocks) */

    UNLOCK(bc.lock);

    return 0;
}


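/*
   The routines below are the public faces of cache_block_io().

   get_block() and get_empty_block() hand back a pointer to the cached
   data itself and take a lock reference on the block.  A sketch of the
   usual pattern (release_block() and mark_blocks_dirty() are the
   companion calls this cache exports; see cache.h):

       char *bh = (char *)get_block(fd, bnum, bsize);
       if (bh != NULL) {
           ... read or modify the block in place ...
           mark_blocks_dirty(fd, bnum, 1);    // only if it was modified
           release_block(fd, bnum);           // drops the lock ref
       }
*/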
void *
get_block(int dev, fs_off_t bnum, int bsize)
{
    void *data;

    if (cache_block_io(dev, bnum, NULL, 1, bsize,
                       CACHE_READ|CACHE_LOCKED|CACHE_READ_AHEAD_OK,
                       &data) != 0)
        return NULL;

    return data;
}

void *
get_empty_block(int dev, fs_off_t bnum, int bsize)
{
    void *data;

    if (cache_block_io(dev, bnum, NULL, 1, bsize, CACHE_NOOP|CACHE_LOCKED,
                       &data) != 0)
        return NULL;

    return data;
}

int
cached_read(int dev, fs_off_t bnum, void *data, fs_off_t num_blocks, int bsize)
{
    return cache_block_io(dev, bnum, data, num_blocks, bsize,
                          CACHE_READ | CACHE_READ_AHEAD_OK, NULL);
}


int
cached_write(int dev, fs_off_t bnum, const void *data, fs_off_t num_blocks, int bsize)
{
    return cache_block_io(dev, bnum, (void *)data, num_blocks, bsize,
                          CACHE_WRITE, NULL);
}
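
/*
   cached_read() and cached_write() are the copy-in/copy-out flavors:
   the caller supplies its own buffer and no lock reference is taken
   on the blocks.  A sketch of a simple read-modify-write:

       char buf[1024];

       if (cached_read(fd, bnum, buf, 1, 1024) == 0) {
           ... modify buf ...
           cached_write(fd, bnum, buf, 1, 1024);
       }
*/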

int
cached_write_locked(int dev, fs_off_t bnum, const void *data,
                    fs_off_t num_blocks, int bsize)
{
    return cache_block_io(dev, bnum, (void *)data, num_blocks, bsize,
                          CACHE_WRITE | CACHE_LOCKED, NULL);
}


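/*
   force_cache_flush() pushes dirty and cloned blocks belonging to one
   device out to disk.  When prefer_log_blocks is non-zero, only blocks
   with a journal callback attached (ce->func) are picked from the
   normal list; cloned blocks on the locked list are grabbed as well
   if there is room left in the flush batch.
*/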
void
force_cache_flush(int dev, int prefer_log_blocks)
{
    int        i, count = 0;
    cache_ent *ce;
    cache_ent *ents[NUM_FLUSH_BLOCKS];


    LOCK(bc.lock);

    for(ce=bc.normal.lru; ce; ce=ce->next) {
        if ((ce->dev == dev) &&
            (ce->flags & CE_BUSY) == 0 &&
            ((ce->flags & CE_DIRTY) || ce->clone) &&
            ((prefer_log_blocks && ce->func) || (prefer_log_blocks == 0))) {

            ce->flags |= CE_BUSY;
            ents[count++] = ce;

            if (count >= NUM_FLUSH_BLOCKS) {
                break;
            }
        }
    }

    /* if we've got some room left, try and grab any cloned blocks */
    if (count < NUM_FLUSH_BLOCKS) {
        for(ce=bc.locked.lru; ce; ce=ce->next) {
            if ((ce->dev == dev) &&
                (ce->flags & CE_BUSY) == 0 &&
                (ce->clone)) {

                ce->flags |= CE_BUSY;
                ents[count++] = ce;

                if (count >= NUM_FLUSH_BLOCKS) {
                    break;
                }
            }
        }
    }

    UNLOCK(bc.lock);

    if (count != 0) {
        qsort(ents, count, sizeof(cache_ent *), cache_ent_cmp);
        flush_ents(ents, count);

        for(i=0; i < count; i++)
            ents[i]->flags &= ~CE_BUSY;
    }
}
