1/*
2   This file contains the global device cache for the BeOS.  All
3   file system I/O comes through here.  The cache can handle blocks
4   of different sizes for multiple different underlying physical
5   devices.
6
7   The cache is organized as a hash table (for lookups by device
8   and block number) and two doubly-linked lists.  The normal
9   list is for "normal" blocks which are either clean or dirty.
10   The locked list is for blocks that are locked in the cache by
11   BFS.  The lists are LRU ordered.
12
13   Most of the work happens in the function cache_block_io() which
14   is quite lengthy.  The other functions of interest are get_ents()
15   which picks victims to be kicked out of the cache; flush_ents()
16   which does the work of flushing them; and set_blocks_info() which
17   handles cloning blocks and setting callbacks so that the BFS
18   journal will work properly.  If you want to modify this code it
19   will take some study but it's not too bad.  Do not think about
20   separating the list of clean and dirty blocks into two lists as
21   I did that already and it's slower.
22
23   Originally this cache code was written while listening to the album
24   "Ride the Lightning" by Metallica.  The current version was written
25   while listening to "Welcome to SkyValley" by Kyuss as well as an
26   ambient collection on the German label FAX.  Music helps a lot when
27   writing code.
28
29   THIS CODE COPYRIGHT DOMINIC GIAMPAOLO.  NO WARRANTY IS EXPRESSED
30   OR IMPLIED.  YOU MAY USE THIS CODE AND FREELY DISTRIBUTE IT FOR
31   NON-COMMERCIAL USE AS LONG AS THIS NOTICE REMAINS ATTACHED.
32
33   FOR COMMERCIAL USE, CONTACT DOMINIC GIAMPAOLO (dbg@be.com).
34
35   Dominic Giampaolo
36   dbg@be.com
37*/
38
39#include "compat.h"
40
41#include <stdio.h>
42#include <stdlib.h>
43#include <memory.h>
44#include <string.h>
45#include <errno.h>
46#include <fcntl.h>
47#include <sys/types.h>
48#include <unistd.h>
49
50
51#if (defined(__BEOS__) || defined(__HAIKU__))
52#include <OS.h>
53#include <KernelExport.h>
54#endif
55
56#include "lock.h"
57#include "cache.h"
58
59
60
61#ifndef USER
62#define printf dprintf
63#endif
64#ifdef USER
65#define kprintf printf
66#endif
67
68
69/* forward prototypes */
70static int   flush_ents(cache_ent **ents, int n_ents);
71
72//static int   do_dump(int argc, char **argv);
73//static int   do_find_block(int argc, char **argv);
74//static int   do_find_data(int argc, char **argv);
75//static void  cache_flusher(void *arg, int phase);
76
77
78int chatty_io = 0;
79
80#define CHUNK (512 * 1024)   /* a hack to work around scsi driver bugs */
81
82size_t
83read_phys_blocks(int fd, fs_off_t bnum, void *data, uint num_blocks, int bsize)
84{
85    size_t ret = 0;
86    size_t sum;
87
88    if (chatty_io)
89        printf("R: %8Ld : %3d\n", bnum, num_blocks);
90
91    if (num_blocks * bsize < CHUNK)
92        ret = read_pos(fd, bnum * bsize, data, num_blocks * bsize);
93    else {
94        for(sum=0; (sum + CHUNK) <= (num_blocks * bsize); sum += CHUNK) {
95            ret = read_pos(fd, (bnum * bsize) + sum, data, CHUNK);
96            if (ret != CHUNK)
97                break;
98
99            data = (void *)((char *)data + CHUNK);
100        }
101
102        if (ret == CHUNK && ((num_blocks * bsize) - sum) > 0) {
103            ret = read_pos(fd, (bnum * bsize) + sum, data,
104                           (num_blocks * bsize) - sum);
105
106            if (ret == (num_blocks * bsize) - sum)
107                ret = num_blocks * bsize;
108        } else if (ret == CHUNK) {
109            ret = num_blocks * bsize;
110        }
111    }
112
113    if (ret == num_blocks * bsize)
114        return 0;
115    else
116        return FS_EBADF;
117}
118
119size_t
120write_phys_blocks(int fd, fs_off_t bnum, void *data, uint num_blocks, int bsize)
121{
122    size_t ret = 0;
123    size_t sum;
124
125    if (chatty_io)
126        printf("W: %8Ld : %3d\n", bnum, num_blocks);
127
128    if (num_blocks * bsize < CHUNK)
129        ret = write_pos(fd, bnum * bsize, data, num_blocks * bsize);
130    else {
131        for(sum=0; (sum + CHUNK) <= (num_blocks * bsize); sum += CHUNK) {
132            ret = write_pos(fd, (bnum * bsize) + sum, data, CHUNK);
133            if (ret != CHUNK)
134                break;
135
136            data = (void *)((char *)data + CHUNK);
137        }
138
139        if (ret == CHUNK && ((num_blocks * bsize) - sum) > 0) {
140            ret = write_pos(fd, (bnum * bsize) + sum, data,
141                            (num_blocks * bsize) - sum);
142
143            if (ret == (num_blocks * bsize) - sum)
144                ret = num_blocks * bsize;
145        } else if (ret == CHUNK) {
146            ret = num_blocks * bsize;
147        }
148    }
149
150
151    if (ret == num_blocks * bsize)
152        return 0;
153    else
154        return FS_EBADF;
155}
156
157//	#pragma mark -
158
159static int
160init_hash_table(hash_table *ht)
161{
162    ht->max   = HT_DEFAULT_MAX;
163    ht->mask  = ht->max - 1;
164    ht->num_elements = 0;
165
166    ht->table = (hash_ent **)calloc(ht->max, sizeof(hash_ent *));
167    if (ht->table == NULL)
168        return FS_ENOMEM;
169
170    return 0;
171}
172
173
174static void
175shutdown_hash_table(hash_table *ht)
176{
177    int       i, hash_len;
178    hash_ent *he, *next;
179
180    for(i=0; i < ht->max; i++) {
181        he = ht->table[i];
182
183        for(hash_len=0; he; hash_len++, he=next) {
184            next = he->next;
185            free(he);
186        }
187    }
188
189    if (ht->table)
190        free(ht->table);
191    ht->table = NULL;
192}
193
194#if 0
195static void
196print_hash_stats(hash_table *ht)
197{
198    int       i, hash_len, max = -1, sum = 0;
199    hash_ent *he, *next;
200
201    for(i=0; i < ht->max; i++) {
202        he = ht->table[i];
203
204        for(hash_len=0; he; hash_len++, he=next) {
205            next = he->next;
206        }
207        if (hash_len)
208            printf("bucket %3d : %3d\n", i, hash_len);
209
210        sum += hash_len;
211        if (hash_len > max)
212            max = hash_len;
213    }
214
215    printf("max # of chains: %d,  average chain length %d\n", max,sum/ht->max);
216}
217#endif
218
219#define HASH(d, b)   ((((fs_off_t)d) << (sizeof(fs_off_t)*8 - 6)) | (b))
220
221static hash_ent *
222new_hash_ent(int dev, fs_off_t bnum, void *data)
223{
224    hash_ent *he;
225
226    he = (hash_ent *)malloc(sizeof(*he));
227    if (he == NULL)
228        return NULL;
229
230    he->hash_val = HASH(dev, bnum);
231    he->dev      = dev;
232    he->bnum     = bnum;
233    he->data     = data;
234    he->next     = NULL;
235
236    return he;
237}
238
239
240static int
241grow_hash_table(hash_table *ht)
242{
243    int        i, omax, newsize, newmask;
244    fs_off_t      hash;
245    hash_ent **new_table, *he, *next;
246
247    if (ht->max & ht->mask) {
248        printf("*** hashtable size %d or mask %d looks weird!\n", ht->max,
249               ht->mask);
250    }
251
252    omax    = ht->max;
253    newsize = omax * 2;        /* have to grow in powers of two */
254    newmask = newsize - 1;
255
256    new_table = (hash_ent **)calloc(newsize, sizeof(hash_ent *));
257    if (new_table == NULL)
258        return FS_ENOMEM;
259
260    for(i=0; i < omax; i++) {
261        for(he=ht->table[i]; he; he=next) {
262            hash = he->hash_val & newmask;
263            next = he->next;
264
265            he->next        = new_table[hash];
266            new_table[hash] = he;
267        }
268    }
269
270    free(ht->table);
271    ht->table = new_table;
272    ht->max   = newsize;
273    ht->mask  = newmask;
274
275    return 0;
276}
277
278
279
280
281static int
282hash_insert(hash_table *ht, int dev, fs_off_t bnum, void *data)
283{
284    fs_off_t    hash;
285    hash_ent *he, *curr;
286
287    hash = HASH(dev, bnum) & ht->mask;
288
289    curr = ht->table[hash];
290    for(; curr != NULL; curr=curr->next)
291        if (curr->dev == dev && curr->bnum == bnum)
292            break;
293
294    if (curr && curr->dev == dev && curr->bnum == bnum) {
295        printf("entry %d:%Ld already in the hash table!\n", dev, bnum);
296        return FS_EEXIST;
297    }
298
299    he = new_hash_ent(dev, bnum, data);
300    if (he == NULL)
301        return FS_ENOMEM;
302
303    he->next        = ht->table[hash];
304    ht->table[hash] = he;
305
306    ht->num_elements++;
307    if (ht->num_elements >= ((ht->max * 3) / 4)) {
308        if (grow_hash_table(ht) != 0)
309            return FS_ENOMEM;
310    }
311
312    return 0;
313}
314
315static void *
316hash_lookup(hash_table *ht, int dev, fs_off_t bnum)
317{
318    hash_ent *he;
319
320    he = ht->table[HASH(dev, bnum) & ht->mask];
321
322    for(; he != NULL; he=he->next) {
323        if (he->dev == dev && he->bnum == bnum)
324            break;
325    }
326
327    if (he)
328        return he->data;
329    else
330        return NULL;
331}
332
333
334static void *
335hash_delete(hash_table *ht, int dev, fs_off_t bnum)
336{
337    void     *data;
338    fs_off_t     hash;
339    hash_ent *he, *prev = NULL;
340
341    hash = HASH(dev, bnum) & ht->mask;
342    he = ht->table[hash];
343
344    for(; he != NULL; prev=he,he=he->next) {
345        if (he->dev == dev && he->bnum == bnum)
346            break;
347    }
348
349    if (he == NULL) {
350        printf("*** hash_delete: tried to delete non-existent block %d:%Ld\n",
351               dev, bnum);
352        return NULL;
353    }
354
355    data = he->data;
356
357    if (ht->table[hash] == he)
358        ht->table[hash] = he->next;
359    else if (prev)
360        prev->next = he->next;
361    else
362        panic("hash table is inconsistent\n");
363
364    free(he);
365    ht->num_elements--;
366
367    return data;
368}
369
370//	#pragma mark -
371
372/*
373  These are the global variables for the cache.
374*/
375static block_cache  bc;
376
377#define       MAX_IOVECS  64           /* # of iovecs for use by cache code */
378static lock   iovec_lock;
379static struct iovec *iovec_pool[MAX_IOVECS];  /* each ptr is to an array of iovecs */
380static int    iovec_used[MAX_IOVECS];  /* non-zero == iovec is in use */
381
382#define NUM_FLUSH_BLOCKS 64    /* size of the iovec array pointed by each ptr */
383
384
385#define DEFAULT_READ_AHEAD_SIZE  (32 * 1024)
386static int read_ahead_size = DEFAULT_READ_AHEAD_SIZE;
387
388/* this array stores the size of each device so we can error check requests */
389#define MAX_DEVICES  256
390fs_off_t max_device_blocks[MAX_DEVICES];
391
392
393/* has the time of the last cache access so cache flushing doesn't interfere */
394static bigtime_t last_cache_access = 0;
395
396
397int
398init_block_cache(int max_blocks, int flags)
399{
400    memset(&bc, 0, sizeof(bc));
401    memset(iovec_pool, 0, sizeof(iovec_pool));
402    memset(iovec_used, 0, sizeof(iovec_used));
403    memset(&max_device_blocks, 0, sizeof(max_device_blocks));
404
405    if (init_hash_table(&bc.ht) != 0)
406        return FS_ENOMEM;
407
408    bc.lock.s = iovec_lock.s = -1;
409
410    bc.max_blocks = max_blocks;
411    bc.flags      = flags;
412    if (new_lock(&bc.lock, "bollockcache") != 0)
413        goto err;
414
415    if (new_lock(&iovec_lock, "iovec_lock") != 0)
416        goto err;
417
418    /* allocate two of these up front so vm won't accidently re-enter itself */
419    iovec_pool[0] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS);
420    iovec_pool[1] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS);
421
422#ifndef USER
423#ifdef DEBUG
424    add_debugger_command("bcache", do_dump, "dump the block cache list");
425    add_debugger_command("fblock", do_find_block, "find a block in the cache");
426    add_debugger_command("fdata",  do_find_data, "find a data block ptr in the cache");
427#endif
428    register_kernel_daemon(cache_flusher, NULL, 3);
429#endif
430
431    return 0;
432
433 err:
434    if (bc.lock.s >= 0)
435        free_lock(&bc.lock);
436
437    if (iovec_lock.s >= 0)
438        free_lock(&iovec_lock);
439
440    shutdown_hash_table(&bc.ht);
441    memset((void *)&bc, 0, sizeof(bc));
442    return FS_ENOMEM;
443}
444
445
446static struct iovec *
447get_iovec_array(void)
448{
449    int i;
450    struct iovec *iov;
451
452    LOCK(iovec_lock);
453
454    for(i=0; i < MAX_IOVECS; i++) {
455        if (iovec_used[i] == 0)
456            break;
457    }
458
459    if (i >= MAX_IOVECS)       /* uh-oh */
460        panic("cache: ran out of iovecs (pool 0x%x, used 0x%x)!\n",
461              &iovec_pool[0], &iovec_used[0]);
462
463    if (iovec_pool[i] == NULL) {
464        iovec_pool[i] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS);
465        if (iovec_pool == NULL)
466            panic("can't allocate an iovec!\n");
467    }
468
469    iov = iovec_pool[i];
470    iovec_used[i] = 1;
471
472    UNLOCK(iovec_lock);
473
474    return iov;
475}
476
477
478static void
479release_iovec_array(struct iovec *iov)
480{
481    int i;
482
483    LOCK(iovec_lock);
484
485    for(i=0; i < MAX_IOVECS; i++) {
486        if (iov == iovec_pool[i])
487            break;
488    }
489
490    if (i < MAX_IOVECS)
491        iovec_used[i] = 0;
492    else                     /* uh-oh */
493        printf("cache: released an iovec I don't own (iov %p)\n", iov);
494
495
496    UNLOCK(iovec_lock);
497}
498
499
500
501
502static void
503real_dump_cache_list(cache_ent_list *cel)
504{
505    cache_ent *ce;
506
507    kprintf("starting from LRU end:\n");
508
509    for (ce = cel->lru; ce; ce = ce->next) {
510        kprintf("ce %p dev %2d bnum %6Ld lock %d flag %d arg %p "
511               "clone %p\n", ce, ce->dev, ce->block_num, ce->lock,
512               ce->flags, ce->arg, ce->clone);
513    }
514    kprintf("MRU end\n");
515}
516
517static void
518dump_cache_list(void)
519{
520    kprintf("NORMAL BLOCKS\n");
521    real_dump_cache_list(&bc.normal);
522
523    kprintf("LOCKED BLOCKS\n");
524    real_dump_cache_list(&bc.locked);
525
526    kprintf("cur blocks %d, max blocks %d ht @ %p\n", bc.cur_blocks,
527           bc.max_blocks, &bc.ht);
528}
529
530#if 0
531static void
532check_bcache(char *str)
533{
534    int count = 0;
535    cache_ent *ce, *prev = NULL;
536
537    LOCK(bc.lock);
538
539    for(ce=bc.normal.lru; ce; prev=ce, ce=ce->next) {
540        count++;
541    }
542
543    for(ce=bc.locked.lru; ce; prev=ce, ce=ce->next) {
544        count++;
545    }
546
547    if (count != bc.cur_blocks) {
548        if (count < bc.cur_blocks - 16)
549            panic("%s: count == %d, cur_blocks %d, prev 0x%x\n",
550                    str, count, bc.cur_blocks, prev);
551        else
552            printf("%s: count == %d, cur_blocks %d, prev 0x%x\n",
553                    str, count, bc.cur_blocks, prev);
554    }
555
556    UNLOCK(bc.lock);
557}
558
559
560static void
561dump_lists(void)
562{
563    cache_ent *nce;
564
565    printf("LOCKED 0x%x  (tail 0x%x, head 0x%x)\n", &bc.locked,
566           bc.locked.lru, bc.locked.mru);
567    for(nce=bc.locked.lru; nce; nce=nce->next)
568        printf("nce @ 0x%x dev %d bnum %ld flags %d lock %d clone 0x%x func 0x%x\n",
569               nce, nce->dev, nce->block_num, nce->flags, nce->lock, nce->clone,
570               nce->func);
571
572    printf("NORMAL 0x%x  (tail 0x%x, head 0x%x)\n", &bc.normal,
573           bc.normal.lru, bc.normal.mru);
574    for(nce=bc.normal.lru; nce; nce=nce->next)
575        printf("nce @ 0x%x dev %d bnum %ld flags %d lock %d clone 0x%x func 0x%x\n",
576               nce, nce->dev, nce->block_num, nce->flags, nce->lock, nce->clone,
577               nce->func);
578}
579
580
581static void
582check_lists(void)
583{
584    cache_ent *ce, *prev, *oce;
585    cache_ent_list *cel;
586
587    cel = &bc.normal;
588    for(ce=cel->lru,prev=NULL; ce; prev=ce, ce=ce->next) {
589        for(oce=bc.locked.lru; oce; oce=oce->next) {
590            if (oce == ce) {
591                dump_lists();
592                panic("1:ce @ 0x%x is in two lists(cel 0x%x &LOCKED)\n",ce,cel);
593            }
594        }
595    }
596    if (prev && prev != cel->mru) {
597        dump_lists();
598        panic("*** last element in list != cel mru (ce 0x%x, cel 0x%x)\n",
599              prev, cel);
600    }
601
602    cel = &bc.locked;
603    for(ce=cel->lru,prev=NULL; ce; prev=ce, ce=ce->next) {
604        for(oce=bc.normal.lru; oce; oce=oce->next) {
605            if (oce == ce) {
606                dump_lists();
607                panic("3:ce @ 0x%x is in two lists(cel 0x%x & DIRTY)\n",ce,cel);
608            }
609        }
610    }
611    if (prev && prev != cel->mru) {
612        dump_lists();
613        panic("*** last element in list != cel mru (ce 0x%x, cel 0x%x)\n",
614              prev, cel);
615    }
616}
617#endif
618
619
620#ifdef DEBUG
621#if 0
622static int
623do_dump(int argc, char **argv)
624{
625    dump_cache_list();
626    return 1;
627}
628
629
630static int
631do_find_block(int argc, char **argv)
632{
633    int        i;
634    fs_off_t  bnum;
635    cache_ent *ce;
636
637    if (argc < 2) {
638        kprintf("%s: needs a block # argument\n", argv[0]);
639        return 1;
640    }
641
642    for(i=1; i < argc; i++) {
643        bnum = strtoul(argv[i], NULL, 0);
644
645        for(ce=bc.normal.lru; ce; ce=ce->next) {
646            if (ce->block_num == bnum) {
647                kprintf("found clean bnum %ld @ 0x%lx (data @ 0x%lx)\n",
648                        bnum, ce, ce->data);
649            }
650        }
651
652        for(ce=bc.locked.lru; ce; ce=ce->next) {
653            if (ce->block_num == bnum) {
654                kprintf("found locked bnum %ld @ 0x%lx (data @ 0x%lx)\n",
655                        bnum, ce, ce->data);
656            }
657        }
658    }
659
660    return 0;
661}
662
663static int
664do_find_data(int argc, char **argv)
665{
666    int        i;
667    void      *data;
668    cache_ent *ce;
669
670    if (argc < 2) {
671        kprintf("%s: needs a block # argument\n", argv[0]);
672        return 1;
673    }
674
675    for(i=1; i < argc; i++) {
676        data = (void *)strtoul(argv[i], NULL, 0);
677
678        for(ce=bc.normal.lru; ce; ce=ce->next) {
679            if (ce->data == data) {
680                kprintf("found normal data ptr for bnum %ld @ ce 0x%lx\n",
681                        ce->block_num, ce);
682            }
683        }
684
685        for(ce=bc.locked.lru; ce; ce=ce->next) {
686            if (ce->data == data) {
687                kprintf("found locked data ptr for bnum %ld @ ce 0x%lx\n",
688                        ce->block_num, ce);
689            }
690        }
691    }
692
693    return 0;
694}
695#endif
696#endif /* DEBUG */
697
698
699
700/*
701  this function detaches the cache_ent from the list.
702*/
703static void
704delete_from_list(cache_ent_list *cel, cache_ent *ce)
705{
706    if (ce->next)
707        ce->next->prev = ce->prev;
708    if (ce->prev)
709        ce->prev->next = ce->next;
710
711    if (cel->lru == ce)
712        cel->lru = ce->next;
713    if (cel->mru == ce)
714        cel->mru = ce->prev;
715
716    ce->next = NULL;
717    ce->prev = NULL;
718}
719
720
721
722/*
723  this function adds the cache_ent ce to the head of the
724  list (i.e. the MRU end).  the cache_ent should *not*
725  be in any lists.
726*/
727static void
728add_to_head(cache_ent_list *cel, cache_ent *ce)
729{
730if (ce->next != NULL || ce->prev != NULL) {
731    panic("*** ath: ce has non-null next/prev ptr (ce 0x%x nxt 0x%x, prv 0x%x)\n",
732           ce, ce->next, ce->prev);
733}
734
735    ce->next = NULL;
736    ce->prev = cel->mru;
737
738    if (cel->mru)
739        cel->mru->next = ce;
740    cel->mru = ce;
741
742    if (cel->lru == NULL)
743        cel->lru = ce;
744}
745
746
747/*
748  this function adds the cache_ent ce to the tail of the
749  list (i.e. the MRU end).  the cache_ent should *not*
750  be in any lists.
751*/
752static void
753add_to_tail(cache_ent_list *cel, cache_ent *ce)
754{
755if (ce->next != NULL || ce->prev != NULL) {
756    panic("*** att: ce has non-null next/prev ptr (ce 0x%x nxt 0x%x, prv 0x%x)\n",
757           ce, ce->next, ce->prev);
758}
759
760    ce->next = cel->lru;
761    ce->prev = NULL;
762
763    if (cel->lru)
764        cel->lru->prev = ce;
765    cel->lru = ce;
766
767    if (cel->mru == NULL)
768        cel->mru = ce;
769}
770
771
772static int
773cache_ent_cmp(const void *a, const void *b)
774{
775    fs_off_t  diff;
776    cache_ent *p1 = *(cache_ent **)a, *p2 = *(cache_ent **)b;
777
778    if (p1 == NULL || p2 == NULL)
779        panic("cache_ent pointers are null?!? (a 0x%lx, b 0x%lx\n)\n", a, b);
780
781    if (p1->dev == p2->dev) {
782        diff = p1->block_num - p2->block_num;
783        return (int)diff;
784    } else {
785        return p1->dev - p2->dev;
786    }
787}
788
789#if 0
790// ToDo: add this to the fsh (as a background thread)?
791static void
792cache_flusher(void *arg, int phase)
793{
794    int    i, num_ents, err;
795    bigtime_t now = system_time();
796    static cache_ent *ce = NULL;
797    static cache_ent *ents[NUM_FLUSH_BLOCKS];
798
799    /*
800       if someone else was in the cache recently then just bail out so
801       we don't lock them out unnecessarily
802    */
803    if ((now - last_cache_access) < 1000000)
804        return;
805
806    LOCK(bc.lock);
807
808    ce = bc.normal.lru;
809
810    for(num_ents=0; ce && num_ents < NUM_FLUSH_BLOCKS; ce=ce->next) {
811        if (ce->flags & CE_BUSY)
812            continue;
813
814        if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL)
815            continue;
816
817        ents[num_ents] = ce;
818        ents[num_ents]->flags |= CE_BUSY;
819        num_ents++;
820    }
821
822    /* if we've got some room left over, look for cloned locked blocks */
823    if (num_ents < NUM_FLUSH_BLOCKS) {
824        ce = bc.locked.lru;
825
826        for(; num_ents < NUM_FLUSH_BLOCKS;) {
827            for(;
828                ce && ((ce->flags & CE_BUSY) || ce->clone == NULL);
829                ce=ce->next)
830                /* skip ents that meet the above criteria */;
831
832            if (ce == NULL)
833                break;
834
835            ents[num_ents] = ce;
836            ents[num_ents]->flags |= CE_BUSY;
837            ce = ce->next;
838            num_ents++;
839        }
840    }
841
842    UNLOCK(bc.lock);
843
844    if (num_ents == 0)
845        return;
846
847    qsort(ents, num_ents, sizeof(cache_ent **), cache_ent_cmp);
848
849    if ((err = flush_ents(ents, num_ents)) != 0) {
850        printf("flush ents failed (ents @ 0x%lx, num_ents %d!\n",
851               (ulong)ents, num_ents);
852    }
853
854    for(i=0; i < num_ents; i++) {       /* clear the busy bit on each of ent */
855        ents[i]->flags &= ~CE_BUSY;
856    }
857}
858#endif
859
860
861static int
862flush_cache_ent(cache_ent *ce)
863{
864    int   ret = 0;
865    void *data;
866
867    /* if true, then there's nothing to flush */
868    if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL)
869        return 0;
870
871    /* same thing here */
872    if (ce->clone == NULL && ce->lock != 0)
873        return 0;
874
875 restart:
876    if (ce->clone)
877        data = ce->clone;
878    else
879        data = ce->data;
880
881	if (chatty_io > 2) printf("flush: %7Ld\n", ce->block_num);
882	ret = write_phys_blocks(ce->dev, ce->block_num, data, 1, ce->bsize);
883
884    if (ce->func) {
885        ce->func(ce->logged_bnum, 1, ce->arg);
886        ce->func = NULL;
887    }
888
889    if (ce->clone) {
890        free(ce->clone);
891        ce->clone = NULL;
892
893        if (ce->lock == 0 && (ce->flags & CE_DIRTY))
894            goto restart;     /* also write the real data ptr */
895    } else {
896        ce->flags &= ~CE_DIRTY;
897    }
898
899    return ret;
900}
901
902
903static int
904flush_ents(cache_ent **ents, int n_ents)
905{
906    int    i, j, k, ret = 0, bsize, iocnt, do_again = 0;
907    fs_off_t  start_bnum;
908    struct iovec *iov;
909
910    iov = get_iovec_array();
911    if (iov == NULL)
912        return FS_ENOMEM;
913
914restart:
915    for(i=0; i < n_ents; i++) {
916        /* if true, then there's nothing to flush */
917        if ((ents[i]->flags & CE_DIRTY) == 0 && ents[i]->clone == NULL)
918            continue;
919
920        /* if true we can't touch the dirty data yet because it's locked */
921        if (ents[i]->clone == NULL && ents[i]->lock != 0)
922            continue;
923
924
925        bsize      = ents[i]->bsize;
926        start_bnum = ents[i]->block_num;
927
928        for(j=i+1; j < n_ents && (j - i) < NUM_FLUSH_BLOCKS; j++) {
929            if (ents[j]->dev != ents[i]->dev ||
930                ents[j]->block_num != start_bnum + (j - i))
931                break;
932
933            if (ents[j]->clone == NULL && ents[j]->lock != 0)
934                break;
935        }
936
937        if (j == i+1) {           /* only one block, just flush it directly */
938            if ((ret = flush_cache_ent(ents[i])) != 0)
939                break;
940            continue;
941        }
942
943
944        for(k=i,iocnt=0; k < j; k++,iocnt++) {
945            if (ents[k]->clone)
946                iov[iocnt].iov_base = ents[k]->clone;
947            else
948                iov[iocnt].iov_base = ents[k]->data;
949
950            iov[iocnt].iov_len = bsize;
951        }
952
953		if (chatty_io)
954	        printf("writev @ %Ld for %d blocks\n", start_bnum, iocnt);
955
956        ret = writev_pos(ents[i]->dev, start_bnum * (fs_off_t)bsize,
957                         &iov[0], iocnt);
958        if (ret != iocnt*bsize) {
959            int idx;
960
961            printf("flush_ents: writev failed: iocnt %d start bnum %Ld "
962                   "bsize %d, ret %d\n", iocnt, start_bnum, bsize, ret);
963
964            for(idx=0; idx < iocnt; idx++)
965                printf("iov[%2d] = %p :: %ld\n", idx, iov[idx].iov_base,
966                       (long)iov[idx].iov_len);
967
968            printf("error %s writing blocks %Ld:%d (%d != %d)\n",
969                   strerror(errno), start_bnum, iocnt, ret, iocnt*bsize);
970            ret = FS_EINVAL;
971            break;
972        }
973        ret = 0;
974
975
976        for(k=i; k < j; k++) {
977            if (ents[k]->func) {
978                ents[k]->func(ents[k]->logged_bnum, 1, ents[k]->arg);
979                ents[k]->func = NULL;
980            }
981
982            if (ents[k]->clone) {
983                free(ents[k]->clone);
984                ents[k]->clone = NULL;
985            } else {
986                ents[k]->flags &= ~CE_DIRTY;
987            }
988        }
989
990
991        i = j - 1;  /* i gets incremented by the outer for loop */
992    }
993
994    /*
995       here we have to go back through and flush any blocks that are
996       still dirty.  with an arched brow you astutely ask, "but how
997       could this happen given the above loop?"  Ahhh young grasshopper
998       I say, the path through the cache is long and twisty and fraught
999       with peril.  The reason it can happen is that a block can be both
1000       cloned and dirty.  The above loop would only flush the cloned half
1001       of the data, not the main dirty block.  So we have to go back
1002       through and see if there are any blocks that are still dirty.  If
1003       there are we go back to the top of the function and do the whole
1004       thing over.  Kind of grody but it is necessary to insure the
1005       correctness of the log for the Be file system.
1006    */
1007    if (do_again == 0) {
1008        for(i=0; i < n_ents; i++) {
1009            if ((ents[i]->flags & CE_DIRTY) == 0 || ents[i]->lock)
1010                continue;
1011
1012            do_again = 1;
1013            break;
1014        }
1015
1016        if (do_again)
1017            goto restart;
1018    }
1019
1020    release_iovec_array(iov);
1021
1022
1023    return ret;
1024}
1025
1026static void
1027delete_cache_list(cache_ent_list *cel)
1028{
1029    void      *junk;
1030    cache_ent *ce, *next;
1031
1032    for (ce = cel->lru; ce; ce = next) {
1033        next = ce->next;
1034        if (ce->lock != 0) {
1035            if (ce->func)
1036                printf("*** shutdown_block_cache: block %Ld, lock == %d "
1037                       "(arg %p)!\n", ce->block_num, ce->lock, ce->arg);
1038            else
1039                printf("*** shutdown_block_cache: block %Ld, lock == %d!\n",
1040                       ce->block_num, ce->lock);
1041        }
1042
1043        if (ce->flags & CE_BUSY) {
1044            printf("* shutdown block cache: bnum %Ld is busy? ce %p\n",
1045                   ce->block_num, ce);
1046        }
1047
1048        if ((ce->flags & CE_DIRTY) || ce->clone) {
1049            flush_cache_ent(ce);
1050        }
1051
1052        if (ce->clone)
1053            free(ce->clone);
1054        ce->clone = NULL;
1055
1056        if (ce->data)
1057            free(ce->data);
1058        ce->data = NULL;
1059
1060        if ((junk = hash_delete(&bc.ht, ce->dev, ce->block_num)) != ce) {
1061            printf("*** free_device_cache: bad hash table entry %Ld "
1062                   "%p != %p\n", ce->block_num, junk, ce);
1063        }
1064
1065        memset(ce, 0xfd, sizeof(*ce));
1066        free(ce);
1067
1068        bc.cur_blocks--;
1069    }
1070}
1071
1072
1073void
1074shutdown_block_cache(void)
1075{
1076    /* print_hash_stats(&bc.ht); */
1077
1078    if (bc.lock.s > 0)
1079        LOCK(bc.lock);
1080
1081#ifndef USER
1082    unregister_kernel_daemon(cache_flusher, NULL);
1083#endif
1084
1085    delete_cache_list(&bc.normal);
1086    delete_cache_list(&bc.locked);
1087
1088    bc.normal.lru = bc.normal.mru = NULL;
1089    bc.locked.lru = bc.locked.mru = NULL;
1090
1091    shutdown_hash_table(&bc.ht);
1092
1093    if (bc.lock.s > 0)
1094        free_lock(&bc.lock);
1095    bc.lock.s = -1;
1096
1097    if (iovec_lock.s >= 0)
1098        free_lock(&iovec_lock);
1099}
1100
1101
1102
1103int
1104init_cache_for_device(int fd, fs_off_t max_blocks)
1105{
1106    int ret = 0;
1107
1108printf("init_cache_for_device(%d, %lld)\n", fd, max_blocks);
1109
1110    if (fd >= MAX_DEVICES)
1111        return -1;
1112
1113    LOCK(bc.lock);
1114
1115    if (max_device_blocks[fd] != 0) {
1116        printf("device %d is already initialized!\n", fd);
1117        ret = -1;
1118    } else {
1119        max_device_blocks[fd] = max_blocks;
1120    }
1121
1122    UNLOCK(bc.lock);
1123
1124    return ret;
1125}
1126
1127
1128
1129/*
1130   this routine assumes that bc.lock has been acquired
1131*/
1132static cache_ent *
1133block_lookup(int dev, fs_off_t bnum)
1134{
1135    int        count = 0;
1136    cache_ent *ce;
1137
1138    while (1) {
1139        ce = hash_lookup(&bc.ht, dev, bnum);
1140        if (ce == NULL)
1141            return NULL;
1142
1143        if ((ce->flags & CE_BUSY) == 0) /* it's ok, break out and return it */
1144            break;
1145
1146        /* else, it's busy and we need to retry our lookup */
1147        UNLOCK(bc.lock);
1148
1149        snooze(5000);
1150        if (count++ == 5000) {  /* then a lot of time has elapsed */
1151            printf("block %Ld isn't coming un-busy (ce @ %p)\n",
1152                   ce->block_num, ce);
1153        }
1154
1155        LOCK(bc.lock);
1156    }
1157
1158    if (ce->flags & CE_BUSY)
1159        panic("block lookup: returning a busy block @ 0x%lx?!?\n",(ulong)ce);
1160
1161    return ce;
1162}
1163
1164
1165
1166int
1167set_blocks_info(int dev, fs_off_t *blocks, int nblocks,
1168               void (*func)(fs_off_t bnum, size_t nblocks, void *arg), void *arg)
1169{
1170    int        i, j, cur;
1171    cache_ent *ce;
1172    cache_ent *ents[NUM_FLUSH_BLOCKS];
1173
1174    LOCK(bc.lock);
1175
1176
1177    for(i=0, cur=0; i < nblocks; i++) {
1178
1179        /* printf("sbi:   %ld (arg 0x%x)\n", blocks[i], arg); */
1180        ce = block_lookup(dev, blocks[i]);
1181        if (ce == NULL) {
1182            panic("*** set_block_info can't find bnum %ld!\n", blocks[i]);
1183            UNLOCK(bc.lock);
1184            return FS_ENOENT;   /* hopefully this doesn't happen... */
1185        }
1186
1187
1188        if (blocks[i] != ce->block_num || dev != ce->dev) {
1189            UNLOCK(bc.lock);
1190            panic("** error1: looked up dev %d block %ld but found dev %d "
1191                    "bnum %ld\n", dev, blocks[i], ce->dev, ce->block_num);
1192            return FS_EBADF;
1193        }
1194
1195        if (ce->lock == 0) {
1196            panic("* set_block_info on bnum %ld (%d) but it's not locked!\n",
1197                    blocks[i], nblocks);
1198        }
1199
1200
1201        if ((ce->flags & CE_DIRTY) == 0) {
1202            panic("*** set_block_info on non-dirty block bnum %ld (%d)!\n",
1203                    blocks[i], nblocks);
1204        }
1205
1206        ce->flags |= CE_BUSY;     /* mark all blocks as busy till we're done */
1207
1208        /* if there is cloned data, it needs to be flushed now */
1209        if (ce->clone && ce->func) {
1210            ents[cur++] = ce;
1211
1212            if (cur >= NUM_FLUSH_BLOCKS) {
1213                UNLOCK(bc.lock);
1214
1215                qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);
1216
1217                flush_ents(ents, cur);
1218
1219                LOCK(bc.lock);
1220                for(j=0; j < cur; j++)
1221                    ents[j]->flags &= ~CE_BUSY;
1222                cur = 0;
1223            }
1224        }
1225    }
1226
1227
1228    if (cur != 0) {
1229        UNLOCK(bc.lock);
1230
1231        qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);
1232
1233        flush_ents(ents, cur);
1234
1235        LOCK(bc.lock);
1236        for(j=0; j < cur; j++)
1237            ents[j]->flags &= ~CE_BUSY;
1238        cur = 0;
1239    }
1240
1241
1242    /* now go through and set the info that we were asked to */
1243    for (i = 0; i < nblocks; i++) {
1244        /* we can call hash_lookup() here because we know it's around */
1245        ce = hash_lookup(&bc.ht, dev, blocks[i]);
1246        if (ce == NULL) {
1247            panic("*** set_block_info can't find bnum %Ld!\n", blocks[i]);
1248            UNLOCK(bc.lock);
1249            return FS_ENOENT;   /* hopefully this doesn't happen... */
1250        }
1251
1252        ce->flags &= ~(CE_DIRTY | CE_BUSY);
1253
1254        if (ce->func != NULL) {
1255            panic("*** set_block_info non-null callback on bnum %Ld\n",
1256                    ce->block_num);
1257        }
1258
1259        if (ce->clone != NULL) {
1260            panic("*** ce->clone == %p, not NULL in set_block_info\n",
1261                    ce->clone);
1262        }
1263
1264        ce->clone = (void *)malloc(ce->bsize);
1265        if (ce->clone == NULL)
1266            panic("*** can't clone bnum %Ld (bsize %d)\n",
1267                    ce->block_num, ce->bsize);
1268
1269
1270        memcpy(ce->clone, ce->data, ce->bsize);
1271
1272        ce->func   = func;
1273        ce->arg    = arg;
1274
1275        ce->logged_bnum = blocks[i];
1276
1277        ce->lock--;
1278        if (ce->lock < 0) {
1279            printf("sbi: whoa nellie! ce @ %p (%Ld) has lock == %d\n",
1280                   ce, ce->block_num, ce->lock);
1281        }
1282
1283        if (ce->lock == 0) {
1284            delete_from_list(&bc.locked, ce);
1285            add_to_head(&bc.normal, ce);
1286        }
1287    }
1288
1289    UNLOCK(bc.lock);
1290
1291    return 0;
1292}
1293
1294
1295/* this function is only for use by flush_device() */
1296static void
1297do_flush(cache_ent **ents, int max)
1298{
1299    int i;
1300
1301    for(i=0; i < max; i++) {
1302        ents[i]->flags |= CE_BUSY;
1303    }
1304
1305    UNLOCK(bc.lock);
1306
1307    qsort(ents, max, sizeof(cache_ent **), cache_ent_cmp);
1308    flush_ents(ents, max);
1309
1310    LOCK(bc.lock);
1311    for(i=0; i < max; i++) {
1312        ents[i]->flags &= ~CE_BUSY;
1313    }
1314}
1315
1316int
1317flush_device(int dev, int warn_locked)
1318{
1319    int cur;
1320    cache_ent *ce;
1321    cache_ent *ents[NUM_FLUSH_BLOCKS];
1322
1323    LOCK(bc.lock);
1324
1325    cur = 0;
1326    ce = bc.normal.lru;
1327    while (ce) {
1328        if (ce->dev != dev || (ce->flags & CE_BUSY)) {
1329            ce = ce->next;
1330            continue;
1331        }
1332
1333        if ((ce->flags & CE_DIRTY) || ce->clone) {
1334            ents[cur++] = ce;
1335            if (cur >= NUM_FLUSH_BLOCKS) {
1336                do_flush(ents, cur);
1337
1338                ce = bc.normal.lru;
1339                cur = 0;
1340                continue;
1341            }
1342        }
1343
1344        ce = ce->next;
1345    }
1346
1347    if (cur != 0)
1348        do_flush(ents, cur);
1349
1350    cur = 0;
1351    ce = bc.locked.lru;
1352    while (ce) {
1353        if (ce->dev != dev || (ce->flags & CE_BUSY)) {
1354            ce = ce->next;
1355            continue;
1356        }
1357
1358        if (ce->clone) {
1359            ents[cur++] = ce;
1360            if (cur >= NUM_FLUSH_BLOCKS) {
1361                do_flush(ents, cur);
1362
1363                ce = bc.locked.lru;
1364                cur = 0;
1365                continue;
1366            }
1367        }
1368
1369        ce = ce->next;
1370    }
1371
1372    if (cur != 0)
1373        do_flush(ents, cur);
1374
1375    UNLOCK(bc.lock);
1376
1377    return 0;
1378}
1379
1380
1381static void
1382real_remove_cached_blocks(int dev, int allow_writes, cache_ent_list *cel)
1383{
1384    void      *junk;
1385    cache_ent *ce, *next = NULL;
1386
1387    for(ce=cel->lru; ce; ce=next) {
1388        next = ce->next;
1389
1390        if (ce->dev != dev) {
1391            continue;
1392        }
1393
1394        if (ce->lock != 0 || (ce->flags & CE_BUSY)) {
1395            printf("*** remove_cached_dev: block %Ld has lock = %d, flags "
1396                   "0x%x! ce @ 0x%lx\n", ce->block_num, ce->lock, ce->flags,
1397                   (ulong)ce);
1398        }
1399
1400        if (allow_writes == ALLOW_WRITES &&
1401            ((ce->flags & CE_DIRTY) || ce->clone)) {
1402            ce->flags |= CE_BUSY;
1403            flush_cache_ent(ce);
1404            ce->flags &= ~CE_BUSY;
1405        }
1406
1407        /* unlink this guy */
1408        if (cel->lru == ce)
1409            cel->lru = ce->next;
1410
1411        if (cel->mru == ce)
1412            cel->mru = ce->prev;
1413
1414        if (ce->prev)
1415            ce->prev->next = ce->next;
1416        if (ce->next)
1417            ce->next->prev = ce->prev;
1418
1419        if (ce->clone)
1420            free(ce->clone);
1421        ce->clone = NULL;
1422
1423        if (ce->data)
1424            free(ce->data);
1425        ce->data = NULL;
1426
1427        if ((junk = hash_delete(&bc.ht, ce->dev, ce->block_num)) != ce) {
1428            panic("*** remove_cached_device: bad hash table entry %ld "
1429                   "0x%lx != 0x%lx\n", ce->block_num, (ulong)junk, (ulong)ce);
1430        }
1431
1432        free(ce);
1433
1434        bc.cur_blocks--;
1435    }
1436
1437}
1438
1439int
1440remove_cached_device_blocks(int dev, int allow_writes)
1441{
1442printf("remove_cached_device_blocks(%d, %d)\n", dev, allow_writes);
1443
1444	LOCK(bc.lock);
1445
1446    real_remove_cached_blocks(dev, allow_writes, &bc.normal);
1447    real_remove_cached_blocks(dev, allow_writes, &bc.locked);
1448
1449    max_device_blocks[dev] = 0;
1450
1451    UNLOCK(bc.lock);
1452
1453    return 0;
1454}
1455
1456
1457
1458int
1459flush_blocks(int dev, fs_off_t bnum, int nblocks)
1460{
1461    int        cur, i;
1462    cache_ent *ce;
1463    cache_ent *ents[NUM_FLUSH_BLOCKS];
1464
1465    if (nblocks == 0)   /* might as well check for this */
1466        return 0;
1467
1468    LOCK(bc.lock);
1469
1470    cur = 0;
1471    for(; nblocks > 0; nblocks--, bnum++) {
1472        ce = block_lookup(dev, bnum);
1473        if (ce == NULL)
1474            continue;
1475
1476        if (bnum != ce->block_num || dev != ce->dev) {
1477            UNLOCK(bc.lock);
1478            panic("error2: looked up dev %d block %ld but found %d %ld\n",
1479                  dev, bnum, ce->dev, ce->block_num);
1480            return FS_EBADF;
1481        }
1482
1483        if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL)
1484            continue;
1485
1486        ce->flags |= CE_BUSY;
1487        ents[cur++] = ce;
1488
1489        if (cur >= NUM_FLUSH_BLOCKS) {
1490            UNLOCK(bc.lock);
1491
1492            qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);
1493            flush_ents(ents, cur);
1494
1495            LOCK(bc.lock);
1496            for(i=0; i < cur; i++) {
1497                ents[i]->flags &= ~CE_BUSY;
1498            }
1499            cur = 0;
1500        }
1501    }
1502
1503    UNLOCK(bc.lock);
1504
1505    if (cur == 0)     /* nothing more to do */
1506        return 0;
1507
1508    /* flush out the last few buggers */
1509    qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);
1510    flush_ents(ents, cur);
1511
1512    for(i=0; i < cur; i++) {
1513        ents[i]->flags &= ~CE_BUSY;
1514    }
1515
1516    return 0;
1517}
1518
1519
1520int
1521mark_blocks_dirty(int dev, fs_off_t bnum, int nblocks)
1522{
1523    int        ret = 0;
1524    cache_ent *ce;
1525
1526    LOCK(bc.lock);
1527
1528    while(nblocks > 0) {
1529        ce = block_lookup(dev, bnum);
1530        if (ce) {
1531            ce->flags |= CE_DIRTY;
1532            bnum      += 1;
1533            nblocks   -= 1;
1534        } else {     /* hmmm, that's odd, didn't find it */
1535            printf("** mark_blocks_diry couldn't find block %Ld (len %d)\n",
1536                   bnum, nblocks);
1537            ret = FS_ENOENT;
1538            break;
1539        }
1540    }
1541
1542    UNLOCK(bc.lock);
1543
1544    return ret;
1545}
1546
1547
1548
1549int
1550release_block(int dev, fs_off_t bnum)
1551{
1552    cache_ent *ce;
1553
1554    /* printf("rlsb: %ld\n", bnum); */
1555    LOCK(bc.lock);
1556
1557    ce = block_lookup(dev, bnum);
1558    if (ce) {
1559        if (bnum != ce->block_num || dev != ce->dev) {
1560            panic("*** error3: looked up dev %d block %ld but found %d %ld\n",
1561                    dev, bnum, ce->dev, ce->block_num);
1562            UNLOCK(bc.lock);
1563            return FS_EBADF;
1564        }
1565
1566        ce->lock--;
1567
1568        if (ce->lock < 0) {
1569            printf("rlsb: whoa nellie! ce %Ld has lock == %d\n",
1570                   ce->block_num, ce->lock);
1571        }
1572
1573        if (ce->lock == 0) {
1574            delete_from_list(&bc.locked, ce);
1575            add_to_head(&bc.normal, ce);
1576        }
1577
1578    } else {     /* hmmm, that's odd, didn't find it */
1579        panic("** release_block asked to find %ld but it's not here\n",
1580               bnum);
1581    }
1582
1583    UNLOCK(bc.lock);
1584
1585    return 0;
1586}
1587
1588
1589static cache_ent *
1590new_cache_ent(int bsize)
1591{
1592    cache_ent *ce;
1593
1594    ce = (cache_ent *)calloc(1, sizeof(cache_ent));
1595    if (ce == NULL) {
1596        panic("*** error: cache can't allocate memory!\n");
1597        return NULL;
1598    }
1599
1600    ce->data = malloc(bsize);
1601    if (ce->data == NULL) {
1602        free(ce);
1603        panic("** error cache can't allocate data memory\n");
1604        UNLOCK(bc.lock);
1605        return NULL;
1606    }
1607
1608    ce->dev       = -1;
1609    ce->block_num = -1;
1610
1611    return ce;
1612}
1613
1614
1615static void
1616get_ents(cache_ent **ents, int num_needed, int max, int *num_gotten, int bsize)
1617{
1618    int        cur, retry_counter = 0, max_retry = num_needed * 256;
1619    cache_ent *ce;
1620
1621    if (num_needed > max)
1622        panic("get_ents: num_needed %d but max %d (doh!)\n", num_needed, max);
1623
1624    /* if the cache isn't full yet, just allocate the blocks */
1625    for(cur=0; bc.cur_blocks < bc.max_blocks && cur < num_needed; cur++) {
1626        ents[cur] = new_cache_ent(bsize);
1627        if (ents[cur] == NULL)
1628            break;
1629        bc.cur_blocks++;
1630    }
1631
1632    /* pluck off blocks from the LRU end of the normal list, keep trying too */
1633    while(cur < num_needed && retry_counter < max_retry) {
1634        for(ce=bc.normal.lru; ce && cur < num_needed; ce=ce->next) {
1635            if (ce->lock)
1636                panic("get_ents: normal list has locked blocks (ce 0x%x)\n",ce);
1637
1638            if (ce->flags & CE_BUSY)   /* don't touch busy blocks */
1639                continue;
1640
1641            ce->flags   |= CE_BUSY;
1642            ents[cur++]  = ce;
1643        }
1644
1645        if (cur < num_needed) {
1646            UNLOCK(bc.lock);
1647            snooze(10000);
1648            LOCK(bc.lock);
1649            retry_counter++;
1650        }
1651    }
1652
1653    if (cur < num_needed && retry_counter >= max_retry) {  /* oh shit! */
1654        dump_cache_list();
1655        UNLOCK(bc.lock);
1656        panic("get_ents: waited too long; can't get enough ce's (c %d n %d)\n",
1657              cur, num_needed);
1658    }
1659
1660    /*
1661      If the last block is a dirty one, try to get more of 'em so
1662      that we can flush a bunch of blocks at once.
1663    */
1664    if (cur && cur < max &&
1665        ((ents[cur-1]->flags & CE_DIRTY) || ents[cur-1]->clone)) {
1666
1667        for(ce=ents[cur-1]->next; ce && cur < max; ce=ce->next) {
1668            if (ce->flags & CE_BUSY)   /* don't touch busy blocks */
1669                continue;
1670
1671            if (ce->lock)
1672                panic("get_ents:2 dirty list has locked blocks (ce 0x%x)\n",ce);
1673
1674            ce->flags   |= CE_BUSY;
1675            ents[cur++]  = ce;
1676        }
1677    }
1678
1679    *num_gotten = cur;
1680}
1681
1682
1683static int
1684read_into_ents(int dev, fs_off_t bnum, cache_ent **ents, int num, int bsize)
1685{
1686    int    i, ret;
1687    struct iovec *iov;
1688
1689    iov = get_iovec_array();
1690
1691    for (i = 0; i < num; i++) {
1692        iov[i].iov_base = ents[i]->data;
1693        iov[i].iov_len  = bsize;
1694    }
1695
1696	if (chatty_io > 2)
1697		printf("readv @ %Ld for %d blocks (at %Ld, block_size = %d)\n", bnum, num, bnum*bsize, bsize);
1698    ret = readv_pos(dev, bnum*bsize, iov, num);
1699
1700    release_iovec_array(iov);
1701
1702    if (ret != num*bsize) {
1703        printf("read_into_ents: asked to read %d bytes but got %d\n",
1704               num*bsize, ret);
1705        printf("*** iov @ %p (num %d)\n", iov, num);
1706        return FS_EINVAL;
1707    } else
1708        return 0;
1709}
1710
1711
1712
1713#define CACHE_READ          0x0001
1714#define CACHE_WRITE         0x0002
1715#define CACHE_NOOP          0x0004     /* for getting empty blocks */
1716#define CACHE_LOCKED        0x0008
1717#define CACHE_READ_AHEAD_OK 0x0010     /* it's ok to do read-ahead */
1718
1719
1720static char *
1721op_to_str(int op)
1722{
1723    static char buff[128];
1724
1725    if (op & CACHE_READ)
1726        strcpy(buff, "READ");
1727    else if (op & CACHE_WRITE)
1728        strcpy(buff, "WRITE");
1729    else if (op & CACHE_NOOP)
1730        strcpy(buff, "NOP");
1731
1732    if (op & CACHE_LOCKED)
1733        strcat(buff, " LOCKED");
1734
1735    if (op & CACHE_READ_AHEAD_OK)
1736        strcat(buff, " (AHEAD)");
1737
1738    return buff;
1739}
1740
1741static int
1742cache_block_io(int dev, fs_off_t bnum, void *data, fs_off_t num_blocks, int bsize,
1743               int op, void **dataptr)
1744{
1745    size_t          err = 0;
1746    cache_ent      *ce;
1747    cache_ent_list *cel = NULL;
1748
1749    if (chatty_io > 1)
1750        printf("cbio: bnum = %Ld, num_blocks = %Ld, bsize = %d, op = %s\n", bnum, num_blocks,
1751               bsize, op_to_str(op));
1752
1753    /* some sanity checks first */
1754    if (bsize == 0)
1755        panic("cache_io: block size == 0 for bnum %Ld?!?\n", bnum);
1756
1757    if (num_blocks == 0)
1758        panic("cache_io: bnum %Ld has num_blocks == 0!\n", bnum);
1759
1760    if (data == NULL && dataptr == NULL) {
1761        printf("major butthead move: null data and dataptr! bnum %Ld:%Ld\n",
1762                bnum, num_blocks);
1763        return FS_ENOMEM;
1764    }
1765
1766    if (data == NULL) {
1767        if (num_blocks != 1)    /* get_block() should never do that */
1768            panic("cache_io: num_blocks %Ld but should be 1\n",
1769                  num_blocks);
1770
1771        if (op & CACHE_WRITE)
1772            panic("cache_io: get_block() asked to write?!?\n");
1773    }
1774
1775    if (bnum + num_blocks > max_device_blocks[dev]) {
1776        printf("dev %d: access to blocks %Ld:%Ld but max_dev_blocks is %Ld\n",
1777               dev, bnum, num_blocks, max_device_blocks[dev]);
1778
1779		// let the app crash here
1780		debugger("Accessed blocks out of device range!");
1781		//*(int *)0x3100 = 0xc0debabe;
1782
1783        return FS_EINVAL;
1784    }
1785
1786    last_cache_access = system_time();
1787
1788    /* if the i/o is greater than 64k, do it directly */
1789    if (num_blocks * bsize >= 64 * 1024) {
1790        char  *ptr;
1791        fs_off_t  tmp;
1792
1793        if (data == NULL || (op & CACHE_LOCKED)) {
1794            panic("*** asked to do a large locked io that's too hard!\n");
1795        }
1796
1797        if (op & CACHE_READ) {
1798            if (read_phys_blocks(dev, bnum, data, num_blocks, bsize) != 0) {
1799                printf("cache read:read_phys_blocks failed (%s on blocks %Ld:%Ld)!\n",
1800                        strerror(errno), bnum, num_blocks);
1801                return FS_EINVAL;
1802            }
1803
1804            LOCK(bc.lock);
1805
1806            /* if any of the blocks are in the cache, grab them instead */
1807            ptr = data;
1808            for(tmp=bnum; tmp < bnum+num_blocks; tmp++, ptr+=bsize) {
1809                ce = block_lookup(dev, tmp);
1810                /*
1811                    if we find a block in the cache we have to copy its
1812                    data just in case it is more recent than what we just
1813                    read from disk (which could happen if someone wrote
1814                    these blocks after we did the read but before we locked
1815                    the cache and entered this loop).
1816                */
1817                if (ce) {
1818                    if (tmp != ce->block_num || dev != ce->dev) {
1819                        UNLOCK(bc.lock);
1820                        panic("*** error4: looked up dev %d block %Ld but "
1821                                "found %d %Ld\n", dev, tmp, ce->dev,
1822                                ce->block_num);
1823                    }
1824
1825                    memcpy(ptr, ce->data, bsize);
1826                }
1827            }
1828
1829            UNLOCK(bc.lock);
1830        } else if (op & CACHE_WRITE) {
1831            LOCK(bc.lock);
1832
1833            /* if any of the blocks are in the cache, update them too */
1834            ptr = data;
1835            for(tmp=bnum; tmp < bnum+num_blocks; tmp++, ptr+=bsize) {
1836                ce = block_lookup(dev, tmp);
1837                if (ce) {
1838                    if (tmp != ce->block_num || dev != ce->dev) {
1839                        UNLOCK(bc.lock);
1840                        panic("*** error5: looked up dev %d block %Ld but "
1841                                "found %d %Ld\n", dev, tmp, ce->dev,
1842                                ce->block_num);
1843                        return FS_EBADF;
1844                    }
1845
1846                    /* XXXdbg -- this isn't strictly necessary */
1847                    if (ce->clone) {
1848                        printf("over-writing cloned data (ce %p bnum %Ld)...\n", ce, tmp);
1849                        flush_cache_ent(ce);
1850                    }
1851
1852                    /* copy the data into the cache */
1853                    memcpy(ce->data, ptr, bsize);
1854                }
1855            }
1856
1857            UNLOCK(bc.lock);
1858
1859            if (write_phys_blocks(dev, bnum, data, num_blocks, bsize) != 0) {
1860                printf("cache write: write_phys_blocks failed (%s on blocks "
1861                       "%Ld:%Ld)!\n", strerror(errno), bnum, num_blocks);
1862                return FS_EINVAL;
1863            }
1864        } else {
1865            printf("bad cache op %d (bnum %Ld nblocks %Ld)\n", op, bnum, num_blocks);
1866            return FS_EINVAL;
1867        }
1868
1869        return 0;
1870    }
1871
1872
1873    LOCK(bc.lock);
1874    while(num_blocks) {
1875
1876        ce = block_lookup(dev, bnum);
1877        if (ce) {
1878            if (bnum != ce->block_num || dev != ce->dev) {
1879                UNLOCK(bc.lock);
1880                panic("*** error6: looked up dev %d block %ld but found "
1881                        "%d %ld\n", dev, bnum, ce->dev, ce->block_num);
1882                return FS_EBADF;
1883            }
1884
1885            if (bsize != ce->bsize) {
1886                panic("*** requested bsize %d but ce->bsize %d ce @ 0x%x\n",
1887                        bsize, ce->bsize, ce);
1888            }
1889
1890            /* delete this ent from the list it is in because it may change */
1891            if (ce->lock)
1892                cel = &bc.locked;
1893            else
1894                cel = &bc.normal;
1895
1896            delete_from_list(cel, ce);
1897
1898            if (op & CACHE_READ) {
1899                if (data && data != ce->data) {
1900                    memcpy(data, ce->data, bsize);
1901                } else if (dataptr) {
1902                    *dataptr = ce->data;
1903                } else {
1904                    printf("cbio:data %p dptr %p ce @ %p ce->data %p\n",
1905                           data, dataptr, ce, ce->data);
1906                }
1907            } else if (op & CACHE_WRITE) {
1908                if (data && data != ce->data)
1909                    memcpy(ce->data, data, bsize);
1910
1911                ce->flags |= CE_DIRTY;
1912            } else if (op & CACHE_NOOP) {
1913                memset(ce->data, 0, bsize);
1914                if (data)
1915                    memset(data, 0, bsize);
1916
1917                if (dataptr)
1918                    *dataptr = ce->data;
1919
1920                ce->flags |= CE_DIRTY;
1921            } else {
1922                panic("cached_block_io: bogus op %d\n", op);
1923            }
1924
1925            if (op & CACHE_LOCKED)
1926                ce->lock++;
1927
1928            if (ce->lock)
1929                cel = &bc.locked;
1930            else
1931                cel = &bc.normal;
1932
1933            /* now put this ent at the head of the appropriate list */
1934            add_to_head(cel, ce);
1935
1936            if (data != NULL)
1937                data = (void *)((char *)data + bsize);
1938
1939            bnum       += 1;
1940            num_blocks -= 1;
1941
1942            continue;
1943        } else {                                  /* it's not in the cache */
1944            int        cur, cur_nblocks, num_dirty, real_nblocks, num_needed;
1945            cache_ent *ents[NUM_FLUSH_BLOCKS];
1946
1947            /*
1948               here we find out how many additional blocks in this request
1949               are not in the cache.  the idea is that then we can do one
1950               big i/o on that many blocks at once.
1951            */
1952            for(cur_nblocks=1;
1953                cur_nblocks < num_blocks && cur_nblocks < NUM_FLUSH_BLOCKS;
1954                cur_nblocks++) {
1955
1956                /* we can call hash_lookup() directly instead of
1957                   block_lookup() because we don't care about the
1958                   state of the busy bit of the block at this point
1959                */
1960                if (hash_lookup(&bc.ht, dev, bnum + cur_nblocks))
1961                    break;
1962            }
1963
1964            /*
1965              here we try to figure out how many extra blocks we should read
1966              for read-ahead.  we want to read as many as possible that are
1967              not already in the cache and that don't cause us to try and
1968              read beyond the end of the disk.
1969            */
1970            if ((op & CACHE_READ) && (op & CACHE_READ_AHEAD_OK) &&
1971                (cur_nblocks * bsize) < read_ahead_size) {
1972
1973                for(num_needed=cur_nblocks;
1974                    num_needed < (read_ahead_size / bsize);
1975                    num_needed++) {
1976
1977                    if ((bnum + num_needed) >= max_device_blocks[dev])
1978                        break;
1979
1980                    if (hash_lookup(&bc.ht, dev, bnum + num_needed))
1981                        break;
1982                }
1983            } else {
1984                num_needed = cur_nblocks;
1985            }
1986
1987            /* this will get us pointers to a bunch of cache_ents we can use */
1988            get_ents(ents, num_needed, NUM_FLUSH_BLOCKS, &real_nblocks, bsize);
1989
1990            if (real_nblocks < num_needed) {
1991                panic("don't have enough cache ents (need %d got %d %ld::%d)\n",
1992                      num_needed, real_nblocks, bnum, num_blocks);
1993            }
1994
1995            /*
1996              There are now three variables used as limits within the ents
1997              array.  This is how they are related:
1998
1999                 cur_nblocks <= num_needed <= real_nblocks
2000
2001              Ents from 0 to cur_nblocks-1 are going to be used to fulfill
2002              this IO request.  Ents from cur_nblocks to num_needed-1 are
2003              for read-ahead.  Ents from num_needed to real_nblocks are
2004              extra blocks that get_ents() asked us to flush.  Often (and
2005              always on writes) cur_nblocks == num_needed.
2006
2007              Below, we sort the list of ents so that when we flush them
2008              they go out in order.
2009            */
2010
2011            qsort(ents, real_nblocks, sizeof(cache_ent **), cache_ent_cmp);
2012
2013            /*
2014              delete each ent from its list because it will change.  also
2015              count up how many dirty blocks there are and insert into the
2016              hash table any new blocks so that no one else will try to
2017              read them in when we release the cache semaphore to do our I/O.
2018            */
2019            for(cur=0,num_dirty=0; cur < real_nblocks; cur++) {
2020                ce = ents[cur];
2021                ce->flags |= CE_BUSY;
2022
2023                /*
2024                   insert the new block into the hash table with its new block
2025                   number. note that the block is still in the hash table for
2026                   its old block number -- and it has to be until we are done
2027                   flushing it from the cache (to prevent someone else from
2028                   sneaking in in front of us and trying to read the same
2029                   block that we're flushing).
2030                */
2031                if (cur < num_needed) {
2032                    if (hash_insert(&bc.ht, dev, bnum + cur, ce) != 0)
2033                        panic("could not insert cache ent for %d %ld (0x%lx)\n",
2034                              dev, bnum + cur, (ulong)ents[cur]);
2035                }
2036
2037                if (ce->dev == -1)
2038                    continue;
2039
2040                if ((ce->flags & CE_DIRTY) || ce->clone)
2041                    num_dirty++;
2042
2043                if (ce->lock)
2044                    panic("cbio: can't use locked blocks here ce @ 0x%x\n",ce);
2045                else
2046                    cel = &bc.normal;
2047
2048                delete_from_list(cel, ce);
2049            }
2050            ce = NULL;
2051
2052
2053            /*
2054               we release the block cache semaphore here so that we can
2055               go do all the i/o we need to do (flushing dirty blocks
2056               that we're kicking out as well as reading any new data).
2057
2058               because all the blocks we're touching are marked busy
2059               no one else should mess with them while we're doing this.
2060            */
2061            if (num_dirty || (op & CACHE_READ)) {
2062                UNLOCK(bc.lock);
2063
2064                /* this flushes any blocks we're kicking out that are dirty */
2065                if (num_dirty && (err = flush_ents(ents, real_nblocks)) != 0) {
2066                    printf("flush ents failed (ents @ 0x%lx, nblocks %d!\n",
2067                           (ulong)ents, cur_nblocks);
2068                    goto handle_err;
2069                }
2070
2071            }
2072
2073            /*
2074               now that everything is flushed to disk, go through and
2075               make sure that the data blocks we're going to use are
2076               the right block size for this current request (it's
2077               possible we're kicking out some smaller blocks and need
2078               to reallocate the data block pointer). We do this in two
2079               steps, first free'ing everything and then going through
2080               and doing the malloc's to try and be nice to the memory
2081               system (i.e. allow it to coalesce stuff, etc).
2082            */
2083            err = 0;
2084            for(cur=0; cur < num_needed; cur++) {
2085                if (ents[cur]->bsize != bsize) {
2086                    free(ents[cur]->data);
2087                    ents[cur]->data = NULL;
2088
2089                    if (ents[cur]->clone) {
2090                        free(ents[cur]->clone);
2091                        ents[cur]->clone = NULL;
2092                    }
2093                }
2094            }
2095
2096            for(cur=0; cur < num_needed; cur++) {
2097                if (ents[cur]->data == NULL) {
2098                    ents[cur]->data  = (void *)malloc(bsize);
2099                    ents[cur]->bsize = bsize;
2100                }
2101
2102                if (ents[cur]->data == NULL) {
2103                    printf("cache: no memory for block (bsize %d)!\n",
2104                           bsize);
2105                    err = FS_ENOMEM;
2106                    break;
2107                }
2108            }
2109
2110            /*
2111               if this condition is true it's a pretty serious error.
2112               we'll try and back out gracefully but we're in pretty
2113               deep at this point and it ain't going to be easy.
2114            */
2115  handle_err:
2116            if (err) {
2117                for(cur=0; cur < num_needed; cur++) {
2118                    cache_ent *tmp_ce;
2119
2120                    tmp_ce = (cache_ent *)hash_delete(&bc.ht,dev,bnum+cur);
2121                    if (tmp_ce != ents[cur]) {
2122                        panic("hash_del0: %d %ld got 0x%lx, not 0x%lx\n",
2123                                dev, bnum+cur, (ulong)tmp_ce,
2124                                (ulong)ents[cur]);
2125                    }
2126
2127                    tmp_ce = (cache_ent *)hash_delete(&bc.ht,ents[cur]->dev,
2128                                                        ents[cur]->block_num);
2129                    if (tmp_ce != ents[cur]) {
2130                        panic("hash_del1: %d %ld got 0x%lx, not 0x%lx\n",
2131                                ents[cur]->dev, ents[cur]->block_num, (ulong)tmp_ce,
2132                                (ulong)ents[cur]);
2133                    }
2134
2135                    ents[cur]->flags &= ~CE_BUSY;
2136                    if (ents[cur]->data)
2137                        free(ents[cur]->data);
2138                    free(ents[cur]);
2139                    ents[cur] = NULL;
2140
2141                    bc.cur_blocks--;
2142                }
2143
2144                if (cur < real_nblocks) {
2145                    LOCK(bc.lock);
2146                    for(; cur < real_nblocks; cur++) {
2147                        ents[cur]->flags &= ~CE_BUSY;
2148
2149                        /* we have to put them back here */
2150                        add_to_tail(&bc.normal, ents[cur]);
2151                    }
2152                    UNLOCK(bc.lock);
2153                }
2154
2155                return FS_ENOMEM;
2156            }
2157
2158
2159            /*
2160               If we go into this if statement, the block cache lock
2161               has *already been released* up above when we flushed the
2162               dirty entries.  As always, since the blocks we're mucking
2163               with are marked busy, they shouldn't get messed with.
2164            */
2165            err = 0;
2166            if (num_dirty || (op & CACHE_READ)) {
2167                /* this section performs the i/o that we need to do */
2168                if (op & CACHE_READ) {
2169                    err = read_into_ents(dev, bnum, ents, num_needed, bsize);
2170                } else {
2171                    err = 0;
2172                }
2173
2174                if (err != 0) {
2175                    printf("err %s on dev %d block %Ld:%d (%d) "
2176                           "data %p, ents[0] %p\n",
2177                           strerror(errno), dev, bnum, cur_nblocks,
2178                           bsize, data, ents[0]);
2179                }
2180
2181                /*
2182                   acquire the semaphore here so that we can go on mucking
2183                   with the cache data structures.  We need to delete old
2184                   block numbers from the hash table and set the new block
2185                   number's for the blocks we just read in.  We also put the
2186                   read-ahead blocks at the head of mru list.
2187                */
2188
2189                LOCK(bc.lock);
2190            }
2191
2192            for(cur=0; cur < num_needed; cur++) {
2193                cache_ent *tmp_ce;
2194
2195                ce = ents[cur];
2196                if (ce->dev != -1) {
2197                    tmp_ce = hash_delete(&bc.ht, ce->dev, ce->block_num);
2198                    if (tmp_ce == NULL || tmp_ce != ce) {
2199                        panic("*** hash_delete failure (ce 0x%x tce 0x%x)\n",
2200                              ce, tmp_ce);
2201                    }
2202                }
2203
2204                if (err == 0 && cur >= cur_nblocks) {
2205                    ce->dev       = dev;
2206                    ce->block_num = bnum + cur;
2207                    ce->flags    &= ~CE_BUSY;
2208                    add_to_head(&bc.normal, ce);
2209                }
2210            }
2211            ce = NULL;
2212
2213            /*
2214              clear the busy bit on the blocks we force-flushed and
2215              put them on the normal list since they're now clean.
2216            */
2217            for(; cur < real_nblocks; cur++) {
2218                ents[cur]->flags &= ~CE_BUSY;
2219
2220                if (ents[cur]->lock)
2221                    panic("should not have locked blocks here (ce 0x%x)\n",
2222                          ents[cur]);
2223
2224                add_to_tail(&bc.normal, ents[cur]);
2225            }
2226
2227            if (err) {   /* then we have some cleanup to do */
2228                for(cur=0; cur < num_needed; cur++) {
2229                    cache_ent *tmp_ce;
2230
2231                    /* we delete all blocks from the cache so we don't
2232                       leave partially written blocks in the cache */
2233
2234                    tmp_ce = (cache_ent *)hash_delete(&bc.ht,dev,bnum+cur);
2235                    if (tmp_ce != ents[cur]) {
2236                        panic("hash_del: %d %ld got 0x%lx, not 0x%lx\n",
2237                                dev, bnum+cur, (ulong)tmp_ce,
2238                                (ulong)ents[cur]);
2239                    }
2240
2241                    ce = ents[cur];
2242                    ce->flags &= ~CE_BUSY;
2243
2244                    free(ce->data);
2245                    ce->data = NULL;
2246
2247                    free(ce);
2248                    ents[cur] = NULL;
2249
2250                    bc.cur_blocks--;
2251                }
2252                ce = NULL;
2253
2254                UNLOCK(bc.lock);
2255                return err;
2256            }
2257
2258
2259            /*
2260               last step: go through and make sure all the cache_ent
2261               structures have the right data in them, delete old guys, etc.
2262            */
2263            for(cur=0; cur < cur_nblocks; cur++) {
2264                ce = ents[cur];
2265
2266                if (ce->dev != -1) {   /* then clean this guy up */
2267                    if (ce->next || ce->prev)
2268                        panic("ce @ 0x%x should not be in a list yet!\n", ce);
2269
2270                    if (ce->clone)
2271                        free(ce->clone);
2272
2273                    if (ce->data == NULL)
2274                        panic("ce @ 0x%lx has a null data ptr\n", (ulong)ce);
2275                }
2276
2277                ce->dev        = dev;
2278                ce->block_num  = bnum + cur;
2279                ce->bsize      = bsize;
2280                ce->flags      = CE_NORMAL;
2281                ce->lock       = 0;
2282                ce->clone      = NULL;
2283                ce->func = ce->arg  = NULL;
2284                ce->next = ce->prev = NULL;
2285
2286                if (op & CACHE_READ) {
2287                    if (data)
2288                        memcpy(data, ce->data, bsize);
2289                } else if (op & CACHE_WRITE) {
2290                    ce->flags  |= CE_DIRTY;
2291                    memcpy(ce->data, data, bsize);
2292                } else if (op & CACHE_NOOP) {
2293                    memset(ce->data, 0, bsize);
2294                    if (data)
2295                        memset(data, 0, bsize);
2296
2297                    ce->flags |= CE_DIRTY;
2298                }
2299
2300                if (op & CACHE_LOCKED) {
2301                    ce->lock++;
2302                    cel = &bc.locked;
2303                } else {
2304                    cel = &bc.normal;
2305                }
2306
2307                /* now stick this puppy at the head of the mru list */
2308                add_to_head(cel, ce);
2309
2310
2311                if (dataptr) {
2312                    *dataptr = ce->data;
2313                }
2314
2315                if (data != NULL)
2316                    data = (void *)((char *)data + bsize);
2317                else if (cur_nblocks != 1)
2318                    panic("cache can't handle setting data_ptr twice!\n");
2319            }  /* end of for(cur=0; cur < cur_nblocks; cur++) */
2320
2321            bnum       += cur_nblocks;
2322            num_blocks -= cur_nblocks;
2323
2324        }   /* end of else it's not in the cache */
2325
2326    }   /* end of while(num_blocks) */
2327
2328    UNLOCK(bc.lock);
2329
2330    return 0;
2331}
2332
2333
2334void *
2335get_block(int dev, fs_off_t bnum, int bsize)
2336{
2337    void *data;
2338
2339    if (cache_block_io(dev, bnum, NULL, 1, bsize, CACHE_READ|CACHE_LOCKED|CACHE_READ_AHEAD_OK,
2340                       &data) != 0)
2341        return NULL;
2342
2343    return data;
2344}
2345
2346void *
2347get_empty_block(int dev, fs_off_t bnum, int bsize)
2348{
2349    void *data;
2350
2351    if (cache_block_io(dev, bnum, NULL, 1, bsize, CACHE_NOOP|CACHE_LOCKED,
2352                       &data) != 0)
2353        return NULL;
2354
2355    return data;
2356}
2357
2358int
2359cached_read(int dev, fs_off_t bnum, void *data, fs_off_t num_blocks, int bsize)
2360{
2361    return cache_block_io(dev, bnum, data, num_blocks, bsize,
2362                          CACHE_READ | CACHE_READ_AHEAD_OK, NULL);
2363}
2364
2365
2366int
2367cached_write(int dev, fs_off_t bnum, const void *data, fs_off_t num_blocks,int bsize)
2368{
2369    return cache_block_io(dev, bnum, (void *)data, num_blocks, bsize,
2370                          CACHE_WRITE, NULL);
2371}
2372
2373int
2374cached_write_locked(int dev, fs_off_t bnum, const void *data,
2375                    fs_off_t num_blocks, int bsize)
2376{
2377    return cache_block_io(dev, bnum, (void *)data, num_blocks, bsize,
2378                          CACHE_WRITE | CACHE_LOCKED, NULL);
2379}
2380
2381
2382void
2383force_cache_flush(int dev, int prefer_log_blocks)
2384{
2385    int        i, count = 0;
2386    cache_ent *ce;
2387    cache_ent *ents[NUM_FLUSH_BLOCKS];
2388
2389
2390    LOCK(bc.lock);
2391
2392    for(ce=bc.normal.lru; ce; ce=ce->next) {
2393        if ((ce->dev == dev) &&
2394            (ce->flags & CE_BUSY) == 0 &&
2395            ((ce->flags & CE_DIRTY) || ce->clone) &&
2396            ((prefer_log_blocks && ce->func) || (prefer_log_blocks == 0))) {
2397
2398            ce->flags |= CE_BUSY;
2399            ents[count++] = ce;
2400
2401            if (count >= NUM_FLUSH_BLOCKS) {
2402                break;
2403            }
2404        }
2405    }
2406
2407    /* if we've got some room left, try and grab any cloned blocks */
2408    if (count < NUM_FLUSH_BLOCKS) {
2409        for(ce=bc.locked.lru; ce; ce=ce->next) {
2410            if ((ce->dev == dev) &&
2411                (ce->flags & CE_BUSY) == 0 &&
2412                (ce->clone)) {
2413
2414                ce->flags |= CE_BUSY;
2415                ents[count++] = ce;
2416
2417                if (count >= NUM_FLUSH_BLOCKS) {
2418                    break;
2419                }
2420            }
2421        }
2422    }
2423
2424    UNLOCK(bc.lock);
2425
2426    if (count != 0) {
2427        qsort(ents, count, sizeof(cache_ent **), cache_ent_cmp);
2428        flush_ents(ents, count);
2429
2430        for(i=0; i < count; i++)
2431            ents[i]->flags &= ~CE_BUSY;
2432    }
2433}
2434