/* This file contains the global device cache for the BeOS. All file system I/O comes through here. The cache can handle blocks of different sizes for multiple different underlying physical devices. The cache is organized as a hash table (for lookups by device and block number) and two doubly-linked lists. The normal list is for "normal" blocks which are either clean or dirty. The locked list is for blocks that are locked in the cache by BFS. The lists are LRU ordered. Most of the work happens in the function cache_block_io() which is quite lengthy. The other functions of interest are get_ents() which picks victims to be kicked out of the cache; flush_ents() which does the work of flushing them; and set_blocks_info() which handles cloning blocks and setting callbacks so that the BFS journal will work properly. If you want to modify this code it will take some study but it's not too bad. Do not think about separating the list of clean and dirty blocks into two lists as I did that already and it's slower. Originally this cache code was written while listening to the album "Ride the Lightning" by Metallica. The current version was written while listening to "Welcome to SkyValley" by Kyuss as well as an ambient collection on the German label FAX. Music helps a lot when writing code. THIS CODE COPYRIGHT DOMINIC GIAMPAOLO. NO WARRANTY IS EXPRESSED OR IMPLIED. YOU MAY USE THIS CODE AND FREELY DISTRIBUTE IT FOR NON-COMMERCIAL USE AS LONG AS THIS NOTICE REMAINS ATTACHED. FOR COMMERCIAL USE, CONTACT DOMINIC GIAMPAOLO (dbg@be.com). Dominic Giampaolo dbg@be.com */ #include #include #include #include #include #include #include #include #if (defined(__BEOS__) || defined(__HAIKU__)) #include #include #endif #include "compat.h" #include "lock.h" #include "cache.h" #ifndef USER #define printf dprintf #endif #ifdef USER #define kprintf printf #endif /* forward prototypes */ static int flush_ents(cache_ent **ents, int n_ents); //static int do_dump(int argc, char **argv); //static int do_find_block(int argc, char **argv); //static int do_find_data(int argc, char **argv); //static void cache_flusher(void *arg, int phase); int chatty_io = 0; #define CHUNK (512 * 1024) /* a hack to work around scsi driver bugs */ size_t read_phys_blocks(int fd, fs_off_t bnum, void *data, uint num_blocks, int bsize) { size_t ret = 0; size_t sum; if (chatty_io) printf("R: %8Ld : %3d\n", bnum, num_blocks); if (num_blocks * bsize < CHUNK) ret = read_pos(fd, bnum * bsize, data, num_blocks * bsize); else { for(sum=0; (sum + CHUNK) <= (num_blocks * bsize); sum += CHUNK) { ret = read_pos(fd, (bnum * bsize) + sum, data, CHUNK); if (ret != CHUNK) break; data = (void *)((char *)data + CHUNK); } if (ret == CHUNK && ((num_blocks * bsize) - sum) > 0) { ret = read_pos(fd, (bnum * bsize) + sum, data, (num_blocks * bsize) - sum); if (ret == (num_blocks * bsize) - sum) ret = num_blocks * bsize; } else if (ret == CHUNK) { ret = num_blocks * bsize; } } if (ret == num_blocks * bsize) return 0; else return EBADF; } size_t write_phys_blocks(int fd, fs_off_t bnum, void *data, uint num_blocks, int bsize) { size_t ret = 0; size_t sum; if (chatty_io) printf("W: %8Ld : %3d\n", bnum, num_blocks); if (num_blocks * bsize < CHUNK) ret = write_pos(fd, bnum * bsize, data, num_blocks * bsize); else { for(sum=0; (sum + CHUNK) <= (num_blocks * bsize); sum += CHUNK) { ret = write_pos(fd, (bnum * bsize) + sum, data, CHUNK); if (ret != CHUNK) break; data = (void *)((char *)data + CHUNK); } if (ret == CHUNK && ((num_blocks * bsize) - sum) > 0) { ret = write_pos(fd, (bnum * bsize) + sum, data, (num_blocks * bsize) - sum); if (ret == (num_blocks * bsize) - sum) ret = num_blocks * bsize; } else if (ret == CHUNK) { ret = num_blocks * bsize; } } if (ret == num_blocks * bsize) return 0; else return EBADF; } // #pragma mark - static int init_hash_table(hash_table *ht) { ht->max = HT_DEFAULT_MAX; ht->mask = ht->max - 1; ht->num_elements = 0; ht->table = (hash_ent **)calloc(ht->max, sizeof(hash_ent *)); if (ht->table == NULL) return ENOMEM; return 0; } static void shutdown_hash_table(hash_table *ht) { int i, hash_len; hash_ent *he, *next; for(i=0; i < ht->max; i++) { he = ht->table[i]; for(hash_len=0; he; hash_len++, he=next) { next = he->next; free(he); } } if (ht->table) free(ht->table); ht->table = NULL; } #if 0 static void print_hash_stats(hash_table *ht) { int i, hash_len, max = -1, sum = 0; hash_ent *he, *next; for(i=0; i < ht->max; i++) { he = ht->table[i]; for(hash_len=0; he; hash_len++, he=next) { next = he->next; } if (hash_len) printf("bucket %3d : %3d\n", i, hash_len); sum += hash_len; if (hash_len > max) max = hash_len; } printf("max # of chains: %d, average chain length %d\n", max,sum/ht->max); } #endif #define HASH(d, b) ((((fs_off_t)d) << (sizeof(fs_off_t)*8 - 6)) | (b)) static hash_ent * new_hash_ent(int dev, fs_off_t bnum, void *data) { hash_ent *he; he = (hash_ent *)malloc(sizeof(*he)); if (he == NULL) return NULL; he->hash_val = HASH(dev, bnum); he->dev = dev; he->bnum = bnum; he->data = data; he->next = NULL; return he; } static int grow_hash_table(hash_table *ht) { int i, omax, newsize, newmask; fs_off_t hash; hash_ent **new_table, *he, *next; if (ht->max & ht->mask) { printf("*** hashtable size %d or mask %d looks weird!\n", ht->max, ht->mask); } omax = ht->max; newsize = omax * 2; /* have to grow in powers of two */ newmask = newsize - 1; new_table = (hash_ent **)calloc(newsize, sizeof(hash_ent *)); if (new_table == NULL) return ENOMEM; for(i=0; i < omax; i++) { for(he=ht->table[i]; he; he=next) { hash = he->hash_val & newmask; next = he->next; he->next = new_table[hash]; new_table[hash] = he; } } free(ht->table); ht->table = new_table; ht->max = newsize; ht->mask = newmask; return 0; } static int hash_insert(hash_table *ht, int dev, fs_off_t bnum, void *data) { fs_off_t hash; hash_ent *he, *curr; hash = HASH(dev, bnum) & ht->mask; curr = ht->table[hash]; for(; curr != NULL; curr=curr->next) if (curr->dev == dev && curr->bnum == bnum) break; if (curr && curr->dev == dev && curr->bnum == bnum) { printf("entry %d:%Ld already in the hash table!\n", dev, bnum); return EEXIST; } he = new_hash_ent(dev, bnum, data); if (he == NULL) return ENOMEM; he->next = ht->table[hash]; ht->table[hash] = he; ht->num_elements++; if (ht->num_elements >= ((ht->max * 3) / 4)) { if (grow_hash_table(ht) != 0) return ENOMEM; } return 0; } static void * hash_lookup(hash_table *ht, int dev, fs_off_t bnum) { hash_ent *he; he = ht->table[HASH(dev, bnum) & ht->mask]; for(; he != NULL; he=he->next) { if (he->dev == dev && he->bnum == bnum) break; } if (he) return he->data; else return NULL; } static void * hash_delete(hash_table *ht, int dev, fs_off_t bnum) { void *data; fs_off_t hash; hash_ent *he, *prev = NULL; hash = HASH(dev, bnum) & ht->mask; he = ht->table[hash]; for(; he != NULL; prev=he,he=he->next) { if (he->dev == dev && he->bnum == bnum) break; } if (he == NULL) { printf("*** hash_delete: tried to delete non-existent block %d:%Ld\n", dev, bnum); return NULL; } data = he->data; if (ht->table[hash] == he) ht->table[hash] = he->next; else if (prev) prev->next = he->next; else panic("hash table is inconsistent\n"); free(he); ht->num_elements--; return data; } // #pragma mark - /* These are the global variables for the cache. */ static block_cache bc; #define MAX_IOVECS 64 /* # of iovecs for use by cache code */ static lock iovec_lock; static struct iovec *iovec_pool[MAX_IOVECS]; /* each ptr is to an array of iovecs */ static int iovec_used[MAX_IOVECS]; /* non-zero == iovec is in use */ #define NUM_FLUSH_BLOCKS 64 /* size of the iovec array pointed by each ptr */ #define DEFAULT_READ_AHEAD_SIZE (32 * 1024) static int read_ahead_size = DEFAULT_READ_AHEAD_SIZE; /* this array stores the size of each device so we can error check requests */ #define MAX_DEVICES 256 fs_off_t max_device_blocks[MAX_DEVICES]; /* has the time of the last cache access so cache flushing doesn't interfere */ static bigtime_t last_cache_access = 0; int init_block_cache(int max_blocks, int flags) { memset(&bc, 0, sizeof(bc)); memset(iovec_pool, 0, sizeof(iovec_pool)); memset(iovec_used, 0, sizeof(iovec_used)); memset(&max_device_blocks, 0, sizeof(max_device_blocks)); if (init_hash_table(&bc.ht) != 0) return ENOMEM; bc.lock.s = iovec_lock.s = -1; bc.max_blocks = max_blocks; bc.flags = flags; if (new_lock(&bc.lock, "bollockcache") != 0) goto err; if (new_lock(&iovec_lock, "iovec_lock") != 0) goto err; /* allocate two of these up front so vm won't accidently re-enter itself */ iovec_pool[0] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS); iovec_pool[1] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS); #ifndef USER #ifdef DEBUG add_debugger_command("bcache", do_dump, "dump the block cache list"); add_debugger_command("fblock", do_find_block, "find a block in the cache"); add_debugger_command("fdata", do_find_data, "find a data block ptr in the cache"); #endif register_kernel_daemon(cache_flusher, NULL, 3); #endif return 0; err: if (bc.lock.s >= 0) free_lock(&bc.lock); if (iovec_lock.s >= 0) free_lock(&iovec_lock); shutdown_hash_table(&bc.ht); memset((void *)&bc, 0, sizeof(bc)); return ENOMEM; } static struct iovec * get_iovec_array(void) { int i; struct iovec *iov; LOCK(iovec_lock); for(i=0; i < MAX_IOVECS; i++) { if (iovec_used[i] == 0) break; } if (i >= MAX_IOVECS) /* uh-oh */ panic("cache: ran out of iovecs (pool 0x%x, used 0x%x)!\n", &iovec_pool[0], &iovec_used[0]); if (iovec_pool[i] == NULL) { iovec_pool[i] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS); if (iovec_pool == NULL) panic("can't allocate an iovec!\n"); } iov = iovec_pool[i]; iovec_used[i] = 1; UNLOCK(iovec_lock); return iov; } static void release_iovec_array(struct iovec *iov) { int i; LOCK(iovec_lock); for(i=0; i < MAX_IOVECS; i++) { if (iov == iovec_pool[i]) break; } if (i < MAX_IOVECS) iovec_used[i] = 0; else /* uh-oh */ printf("cache: released an iovec I don't own (iov %p)\n", iov); UNLOCK(iovec_lock); } static void real_dump_cache_list(cache_ent_list *cel) { cache_ent *ce; kprintf("starting from LRU end:\n"); for (ce = cel->lru; ce; ce = ce->next) { kprintf("ce %p dev %2d bnum %6Ld lock %d flag %d arg %p " "clone %p\n", ce, ce->dev, ce->block_num, ce->lock, ce->flags, ce->arg, ce->clone); } kprintf("MRU end\n"); } static void dump_cache_list(void) { kprintf("NORMAL BLOCKS\n"); real_dump_cache_list(&bc.normal); kprintf("LOCKED BLOCKS\n"); real_dump_cache_list(&bc.locked); kprintf("cur blocks %d, max blocks %d ht @ %p\n", bc.cur_blocks, bc.max_blocks, &bc.ht); } #if 0 static void check_bcache(char *str) { int count = 0; cache_ent *ce, *prev = NULL; LOCK(bc.lock); for(ce=bc.normal.lru; ce; prev=ce, ce=ce->next) { count++; } for(ce=bc.locked.lru; ce; prev=ce, ce=ce->next) { count++; } if (count != bc.cur_blocks) { if (count < bc.cur_blocks - 16) panic("%s: count == %d, cur_blocks %d, prev 0x%x\n", str, count, bc.cur_blocks, prev); else printf("%s: count == %d, cur_blocks %d, prev 0x%x\n", str, count, bc.cur_blocks, prev); } UNLOCK(bc.lock); } static void dump_lists(void) { cache_ent *nce; printf("LOCKED 0x%x (tail 0x%x, head 0x%x)\n", &bc.locked, bc.locked.lru, bc.locked.mru); for(nce=bc.locked.lru; nce; nce=nce->next) printf("nce @ 0x%x dev %d bnum %ld flags %d lock %d clone 0x%x func 0x%x\n", nce, nce->dev, nce->block_num, nce->flags, nce->lock, nce->clone, nce->func); printf("NORMAL 0x%x (tail 0x%x, head 0x%x)\n", &bc.normal, bc.normal.lru, bc.normal.mru); for(nce=bc.normal.lru; nce; nce=nce->next) printf("nce @ 0x%x dev %d bnum %ld flags %d lock %d clone 0x%x func 0x%x\n", nce, nce->dev, nce->block_num, nce->flags, nce->lock, nce->clone, nce->func); } static void check_lists(void) { cache_ent *ce, *prev, *oce; cache_ent_list *cel; cel = &bc.normal; for(ce=cel->lru,prev=NULL; ce; prev=ce, ce=ce->next) { for(oce=bc.locked.lru; oce; oce=oce->next) { if (oce == ce) { dump_lists(); panic("1:ce @ 0x%x is in two lists(cel 0x%x &LOCKED)\n",ce,cel); } } } if (prev && prev != cel->mru) { dump_lists(); panic("*** last element in list != cel mru (ce 0x%x, cel 0x%x)\n", prev, cel); } cel = &bc.locked; for(ce=cel->lru,prev=NULL; ce; prev=ce, ce=ce->next) { for(oce=bc.normal.lru; oce; oce=oce->next) { if (oce == ce) { dump_lists(); panic("3:ce @ 0x%x is in two lists(cel 0x%x & DIRTY)\n",ce,cel); } } } if (prev && prev != cel->mru) { dump_lists(); panic("*** last element in list != cel mru (ce 0x%x, cel 0x%x)\n", prev, cel); } } #endif #ifdef DEBUG #if 0 static int do_dump(int argc, char **argv) { dump_cache_list(); return 1; } static int do_find_block(int argc, char **argv) { int i; fs_off_t bnum; cache_ent *ce; if (argc < 2) { kprintf("%s: needs a block # argument\n", argv[0]); return 1; } for(i=1; i < argc; i++) { bnum = strtoul(argv[i], NULL, 0); for(ce=bc.normal.lru; ce; ce=ce->next) { if (ce->block_num == bnum) { kprintf("found clean bnum %ld @ 0x%lx (data @ 0x%lx)\n", bnum, ce, ce->data); } } for(ce=bc.locked.lru; ce; ce=ce->next) { if (ce->block_num == bnum) { kprintf("found locked bnum %ld @ 0x%lx (data @ 0x%lx)\n", bnum, ce, ce->data); } } } return 0; } static int do_find_data(int argc, char **argv) { int i; void *data; cache_ent *ce; if (argc < 2) { kprintf("%s: needs a block # argument\n", argv[0]); return 1; } for(i=1; i < argc; i++) { data = (void *)strtoul(argv[i], NULL, 0); for(ce=bc.normal.lru; ce; ce=ce->next) { if (ce->data == data) { kprintf("found normal data ptr for bnum %ld @ ce 0x%lx\n", ce->block_num, ce); } } for(ce=bc.locked.lru; ce; ce=ce->next) { if (ce->data == data) { kprintf("found locked data ptr for bnum %ld @ ce 0x%lx\n", ce->block_num, ce); } } } return 0; } #endif #endif /* DEBUG */ /* this function detaches the cache_ent from the list. */ static void delete_from_list(cache_ent_list *cel, cache_ent *ce) { if (ce->next) ce->next->prev = ce->prev; if (ce->prev) ce->prev->next = ce->next; if (cel->lru == ce) cel->lru = ce->next; if (cel->mru == ce) cel->mru = ce->prev; ce->next = NULL; ce->prev = NULL; } /* this function adds the cache_ent ce to the head of the list (i.e. the MRU end). the cache_ent should *not* be in any lists. */ static void add_to_head(cache_ent_list *cel, cache_ent *ce) { if (ce->next != NULL || ce->prev != NULL) { panic("*** ath: ce has non-null next/prev ptr (ce 0x%x nxt 0x%x, prv 0x%x)\n", ce, ce->next, ce->prev); } ce->next = NULL; ce->prev = cel->mru; if (cel->mru) cel->mru->next = ce; cel->mru = ce; if (cel->lru == NULL) cel->lru = ce; } /* this function adds the cache_ent ce to the tail of the list (i.e. the MRU end). the cache_ent should *not* be in any lists. */ static void add_to_tail(cache_ent_list *cel, cache_ent *ce) { if (ce->next != NULL || ce->prev != NULL) { panic("*** att: ce has non-null next/prev ptr (ce 0x%x nxt 0x%x, prv 0x%x)\n", ce, ce->next, ce->prev); } ce->next = cel->lru; ce->prev = NULL; if (cel->lru) cel->lru->prev = ce; cel->lru = ce; if (cel->mru == NULL) cel->mru = ce; } static int cache_ent_cmp(const void *a, const void *b) { fs_off_t diff; cache_ent *p1 = *(cache_ent **)a, *p2 = *(cache_ent **)b; if (p1 == NULL || p2 == NULL) panic("cache_ent pointers are null?!? (a 0x%lx, b 0x%lx\n)\n", a, b); if (p1->dev == p2->dev) { diff = p1->block_num - p2->block_num; return (int)diff; } else { return p1->dev - p2->dev; } } #if 0 // ToDo: add this to the fsh (as a background thread)? static void cache_flusher(void *arg, int phase) { int i, num_ents, err; bigtime_t now = system_time(); static cache_ent *ce = NULL; static cache_ent *ents[NUM_FLUSH_BLOCKS]; /* if someone else was in the cache recently then just bail out so we don't lock them out unnecessarily */ if ((now - last_cache_access) < 1000000) return; LOCK(bc.lock); ce = bc.normal.lru; for(num_ents=0; ce && num_ents < NUM_FLUSH_BLOCKS; ce=ce->next) { if (ce->flags & CE_BUSY) continue; if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL) continue; ents[num_ents] = ce; ents[num_ents]->flags |= CE_BUSY; num_ents++; } /* if we've got some room left over, look for cloned locked blocks */ if (num_ents < NUM_FLUSH_BLOCKS) { ce = bc.locked.lru; for(; num_ents < NUM_FLUSH_BLOCKS;) { for(; ce && ((ce->flags & CE_BUSY) || ce->clone == NULL); ce=ce->next) /* skip ents that meet the above criteria */; if (ce == NULL) break; ents[num_ents] = ce; ents[num_ents]->flags |= CE_BUSY; ce = ce->next; num_ents++; } } UNLOCK(bc.lock); if (num_ents == 0) return; qsort(ents, num_ents, sizeof(cache_ent **), cache_ent_cmp); if ((err = flush_ents(ents, num_ents)) != 0) { printf("flush ents failed (ents @ 0x%lx, num_ents %d!\n", (ulong)ents, num_ents); } for(i=0; i < num_ents; i++) { /* clear the busy bit on each of ent */ ents[i]->flags &= ~CE_BUSY; } } #endif static int flush_cache_ent(cache_ent *ce) { int ret = 0; void *data; /* if true, then there's nothing to flush */ if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL) return 0; /* same thing here */ if (ce->clone == NULL && ce->lock != 0) return 0; restart: if (ce->clone) data = ce->clone; else data = ce->data; if (chatty_io > 2) printf("flush: %7Ld\n", ce->block_num); ret = write_phys_blocks(ce->dev, ce->block_num, data, 1, ce->bsize); if (ce->func) { ce->func(ce->logged_bnum, 1, ce->arg); ce->func = NULL; } if (ce->clone) { free(ce->clone); ce->clone = NULL; if (ce->lock == 0 && (ce->flags & CE_DIRTY)) goto restart; /* also write the real data ptr */ } else { ce->flags &= ~CE_DIRTY; } return ret; } static int flush_ents(cache_ent **ents, int n_ents) { int i, j, k, ret = 0, bsize, iocnt, do_again = 0; fs_off_t start_bnum; struct iovec *iov; iov = get_iovec_array(); if (iov == NULL) return ENOMEM; restart: for(i=0; i < n_ents; i++) { /* if true, then there's nothing to flush */ if ((ents[i]->flags & CE_DIRTY) == 0 && ents[i]->clone == NULL) continue; /* if true we can't touch the dirty data yet because it's locked */ if (ents[i]->clone == NULL && ents[i]->lock != 0) continue; bsize = ents[i]->bsize; start_bnum = ents[i]->block_num; for(j=i+1; j < n_ents && (j - i) < NUM_FLUSH_BLOCKS; j++) { if (ents[j]->dev != ents[i]->dev || ents[j]->block_num != start_bnum + (j - i)) break; if (ents[j]->clone == NULL && ents[j]->lock != 0) break; } if (j == i+1) { /* only one block, just flush it directly */ if ((ret = flush_cache_ent(ents[i])) != 0) break; continue; } for(k=i,iocnt=0; k < j; k++,iocnt++) { if (ents[k]->clone) iov[iocnt].iov_base = ents[k]->clone; else iov[iocnt].iov_base = ents[k]->data; iov[iocnt].iov_len = bsize; } if (chatty_io) printf("writev @ %Ld for %d blocks\n", start_bnum, iocnt); ret = writev_pos(ents[i]->dev, start_bnum * (fs_off_t)bsize, &iov[0], iocnt); if (ret != iocnt*bsize) { int idx; printf("flush_ents: writev failed: iocnt %d start bnum %Ld " "bsize %d, ret %d\n", iocnt, start_bnum, bsize, ret); for(idx=0; idx < iocnt; idx++) printf("iov[%2d] = %p :: %ld\n", idx, iov[idx].iov_base, iov[idx].iov_len); printf("error %s writing blocks %Ld:%d (%d != %d)\n", strerror(errno), start_bnum, iocnt, ret, iocnt*bsize); ret = EINVAL; break; } ret = 0; for(k=i; k < j; k++) { if (ents[k]->func) { ents[k]->func(ents[k]->logged_bnum, 1, ents[k]->arg); ents[k]->func = NULL; } if (ents[k]->clone) { free(ents[k]->clone); ents[k]->clone = NULL; } else { ents[k]->flags &= ~CE_DIRTY; } } i = j - 1; /* i gets incremented by the outer for loop */ } /* here we have to go back through and flush any blocks that are still dirty. with an arched brow you astutely ask, "but how could this happen given the above loop?" Ahhh young grasshopper I say, the path through the cache is long and twisty and fraught with peril. The reason it can happen is that a block can be both cloned and dirty. The above loop would only flush the cloned half of the data, not the main dirty block. So we have to go back through and see if there are any blocks that are still dirty. If there are we go back to the top of the function and do the whole thing over. Kind of grody but it is necessary to insure the correctness of the log for the Be file system. */ if (do_again == 0) { for(i=0; i < n_ents; i++) { if ((ents[i]->flags & CE_DIRTY) == 0 || ents[i]->lock) continue; do_again = 1; break; } if (do_again) goto restart; } release_iovec_array(iov); return ret; } static void delete_cache_list(cache_ent_list *cel) { void *junk; cache_ent *ce, *next; for (ce = cel->lru; ce; ce = next) { next = ce->next; if (ce->lock != 0) { if (ce->func) printf("*** shutdown_block_cache: block %Ld, lock == %d " "(arg %p)!\n", ce->block_num, ce->lock, ce->arg); else printf("*** shutdown_block_cache: block %Ld, lock == %d!\n", ce->block_num, ce->lock); } if (ce->flags & CE_BUSY) { printf("* shutdown block cache: bnum %Ld is busy? ce %p\n", ce->block_num, ce); } if ((ce->flags & CE_DIRTY) || ce->clone) { flush_cache_ent(ce); } if (ce->clone) free(ce->clone); ce->clone = NULL; if (ce->data) free(ce->data); ce->data = NULL; if ((junk = hash_delete(&bc.ht, ce->dev, ce->block_num)) != ce) { printf("*** free_device_cache: bad hash table entry %Ld " "%p != %p\n", ce->block_num, junk, ce); } memset(ce, 0xfd, sizeof(*ce)); free(ce); bc.cur_blocks--; } } void shutdown_block_cache(void) { /* print_hash_stats(&bc.ht); */ if (bc.lock.s > 0) LOCK(bc.lock); #ifndef USER unregister_kernel_daemon(cache_flusher, NULL); #endif delete_cache_list(&bc.normal); delete_cache_list(&bc.locked); bc.normal.lru = bc.normal.mru = NULL; bc.locked.lru = bc.locked.mru = NULL; shutdown_hash_table(&bc.ht); if (bc.lock.s > 0) free_lock(&bc.lock); bc.lock.s = -1; if (iovec_lock.s >= 0) free_lock(&iovec_lock); } int init_cache_for_device(int fd, fs_off_t max_blocks) { int ret = 0; if (fd >= MAX_DEVICES) return -1; LOCK(bc.lock); if (max_device_blocks[fd] != 0) { printf("device %d is already initialized!\n", fd); ret = -1; } else { max_device_blocks[fd] = max_blocks; } UNLOCK(bc.lock); return ret; } /* this routine assumes that bc.lock has been acquired */ static cache_ent * block_lookup(int dev, fs_off_t bnum) { int count = 0; cache_ent *ce; while (1) { ce = hash_lookup(&bc.ht, dev, bnum); if (ce == NULL) return NULL; if ((ce->flags & CE_BUSY) == 0) /* it's ok, break out and return it */ break; /* else, it's busy and we need to retry our lookup */ UNLOCK(bc.lock); snooze(5000); if (count++ == 5000) { /* then a lot of time has elapsed */ printf("block %Ld isn't coming un-busy (ce @ %p)\n", ce->block_num, ce); } LOCK(bc.lock); } if (ce->flags & CE_BUSY) panic("block lookup: returning a busy block @ 0x%lx?!?\n",(ulong)ce); return ce; } int set_blocks_info(int dev, fs_off_t *blocks, int nblocks, void (*func)(fs_off_t bnum, size_t nblocks, void *arg), void *arg) { int i, j, cur; cache_ent *ce; cache_ent *ents[NUM_FLUSH_BLOCKS]; LOCK(bc.lock); for(i=0, cur=0; i < nblocks; i++) { /* printf("sbi: %ld (arg 0x%x)\n", blocks[i], arg); */ ce = block_lookup(dev, blocks[i]); if (ce == NULL) { panic("*** set_block_info can't find bnum %ld!\n", blocks[i]); UNLOCK(bc.lock); return ENOENT; /* hopefully this doesn't happen... */ } if (blocks[i] != ce->block_num || dev != ce->dev) { UNLOCK(bc.lock); panic("** error1: looked up dev %d block %ld but found dev %d " "bnum %ld\n", dev, blocks[i], ce->dev, ce->block_num); return EBADF; } if (ce->lock == 0) { panic("* set_block_info on bnum %ld (%d) but it's not locked!\n", blocks[i], nblocks); } if ((ce->flags & CE_DIRTY) == 0) { panic("*** set_block_info on non-dirty block bnum %ld (%d)!\n", blocks[i], nblocks); } ce->flags |= CE_BUSY; /* mark all blocks as busy till we're done */ /* if there is cloned data, it needs to be flushed now */ if (ce->clone && ce->func) { ents[cur++] = ce; if (cur >= NUM_FLUSH_BLOCKS) { UNLOCK(bc.lock); qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp); flush_ents(ents, cur); LOCK(bc.lock); for(j=0; j < cur; j++) ents[j]->flags &= ~CE_BUSY; cur = 0; } } } if (cur != 0) { UNLOCK(bc.lock); qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp); flush_ents(ents, cur); LOCK(bc.lock); for(j=0; j < cur; j++) ents[j]->flags &= ~CE_BUSY; cur = 0; } /* now go through and set the info that we were asked to */ for (i = 0; i < nblocks; i++) { /* we can call hash_lookup() here because we know it's around */ ce = hash_lookup(&bc.ht, dev, blocks[i]); if (ce == NULL) { panic("*** set_block_info can't find bnum %Ld!\n", blocks[i]); UNLOCK(bc.lock); return ENOENT; /* hopefully this doesn't happen... */ } ce->flags &= ~(CE_DIRTY | CE_BUSY); if (ce->func != NULL) { panic("*** set_block_info non-null callback on bnum %Ld\n", ce->block_num); } if (ce->clone != NULL) { panic("*** ce->clone == %p, not NULL in set_block_info\n", ce->clone); } ce->clone = (void *)malloc(ce->bsize); if (ce->clone == NULL) panic("*** can't clone bnum %Ld (bsize %d)\n", ce->block_num, ce->bsize); memcpy(ce->clone, ce->data, ce->bsize); ce->func = func; ce->arg = arg; ce->logged_bnum = blocks[i]; ce->lock--; if (ce->lock < 0) { printf("sbi: whoa nellie! ce @ %p (%Ld) has lock == %d\n", ce, ce->block_num, ce->lock); } if (ce->lock == 0) { delete_from_list(&bc.locked, ce); add_to_head(&bc.normal, ce); } } UNLOCK(bc.lock); return 0; } /* this function is only for use by flush_device() */ static void do_flush(cache_ent **ents, int max) { int i; for(i=0; i < max; i++) { ents[i]->flags |= CE_BUSY; } UNLOCK(bc.lock); qsort(ents, max, sizeof(cache_ent **), cache_ent_cmp); flush_ents(ents, max); LOCK(bc.lock); for(i=0; i < max; i++) { ents[i]->flags &= ~CE_BUSY; } } int flush_device(int dev, int warn_locked) { int cur; cache_ent *ce; cache_ent *ents[NUM_FLUSH_BLOCKS]; LOCK(bc.lock); cur = 0; ce = bc.normal.lru; while (ce) { if (ce->dev != dev || (ce->flags & CE_BUSY)) { ce = ce->next; continue; } if ((ce->flags & CE_DIRTY) || ce->clone) { ents[cur++] = ce; if (cur >= NUM_FLUSH_BLOCKS) { do_flush(ents, cur); ce = bc.normal.lru; cur = 0; continue; } } ce = ce->next; } if (cur != 0) do_flush(ents, cur); cur = 0; ce = bc.locked.lru; while (ce) { if (ce->dev != dev || (ce->flags & CE_BUSY)) { ce = ce->next; continue; } if (ce->clone) { ents[cur++] = ce; if (cur >= NUM_FLUSH_BLOCKS) { do_flush(ents, cur); ce = bc.locked.lru; cur = 0; continue; } } ce = ce->next; } if (cur != 0) do_flush(ents, cur); UNLOCK(bc.lock); return 0; } static void real_remove_cached_blocks(int dev, int allow_writes, cache_ent_list *cel) { void *junk; cache_ent *ce, *next = NULL; for(ce=cel->lru; ce; ce=next) { next = ce->next; if (ce->dev != dev) { continue; } if (ce->lock != 0 || (ce->flags & CE_BUSY)) { printf("*** remove_cached_dev: block %Ld has lock = %d, flags " "0x%x! ce @ 0x%lx\n", ce->block_num, ce->lock, ce->flags, (ulong)ce); } if (allow_writes == ALLOW_WRITES && ((ce->flags & CE_DIRTY) || ce->clone)) { ce->flags |= CE_BUSY; flush_cache_ent(ce); ce->flags &= ~CE_BUSY; } /* unlink this guy */ if (cel->lru == ce) cel->lru = ce->next; if (cel->mru == ce) cel->mru = ce->prev; if (ce->prev) ce->prev->next = ce->next; if (ce->next) ce->next->prev = ce->prev; if (ce->clone) free(ce->clone); ce->clone = NULL; if (ce->data) free(ce->data); ce->data = NULL; if ((junk = hash_delete(&bc.ht, ce->dev, ce->block_num)) != ce) { panic("*** remove_cached_device: bad hash table entry %ld " "0x%lx != 0x%lx\n", ce->block_num, (ulong)junk, (ulong)ce); } free(ce); bc.cur_blocks--; } } int remove_cached_device_blocks(int dev, int allow_writes) { LOCK(bc.lock); real_remove_cached_blocks(dev, allow_writes, &bc.normal); real_remove_cached_blocks(dev, allow_writes, &bc.locked); max_device_blocks[dev] = 0; UNLOCK(bc.lock); return 0; } int flush_blocks(int dev, fs_off_t bnum, int nblocks) { int cur, i; cache_ent *ce; cache_ent *ents[NUM_FLUSH_BLOCKS]; if (nblocks == 0) /* might as well check for this */ return 0; LOCK(bc.lock); cur = 0; for(; nblocks > 0; nblocks--, bnum++) { ce = block_lookup(dev, bnum); if (ce == NULL) continue; if (bnum != ce->block_num || dev != ce->dev) { UNLOCK(bc.lock); panic("error2: looked up dev %d block %ld but found %d %ld\n", dev, bnum, ce->dev, ce->block_num); return EBADF; } if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL) continue; ce->flags |= CE_BUSY; ents[cur++] = ce; if (cur >= NUM_FLUSH_BLOCKS) { UNLOCK(bc.lock); qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp); flush_ents(ents, cur); LOCK(bc.lock); for(i=0; i < cur; i++) { ents[i]->flags &= ~CE_BUSY; } cur = 0; } } UNLOCK(bc.lock); if (cur == 0) /* nothing more to do */ return 0; /* flush out the last few buggers */ qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp); flush_ents(ents, cur); for(i=0; i < cur; i++) { ents[i]->flags &= ~CE_BUSY; } return 0; } int mark_blocks_dirty(int dev, fs_off_t bnum, int nblocks) { int ret = 0; cache_ent *ce; LOCK(bc.lock); while(nblocks > 0) { ce = block_lookup(dev, bnum); if (ce) { ce->flags |= CE_DIRTY; bnum += 1; nblocks -= 1; } else { /* hmmm, that's odd, didn't find it */ printf("** mark_blocks_diry couldn't find block %Ld (len %d)\n", bnum, nblocks); ret = ENOENT; break; } } UNLOCK(bc.lock); return ret; } int release_block(int dev, fs_off_t bnum) { cache_ent *ce; /* printf("rlsb: %ld\n", bnum); */ LOCK(bc.lock); ce = block_lookup(dev, bnum); if (ce) { if (bnum != ce->block_num || dev != ce->dev) { panic("*** error3: looked up dev %d block %ld but found %d %ld\n", dev, bnum, ce->dev, ce->block_num); UNLOCK(bc.lock); return EBADF; } ce->lock--; if (ce->lock < 0) { printf("rlsb: whoa nellie! ce %Ld has lock == %d\n", ce->block_num, ce->lock); } if (ce->lock == 0) { delete_from_list(&bc.locked, ce); add_to_head(&bc.normal, ce); } } else { /* hmmm, that's odd, didn't find it */ panic("** release_block asked to find %ld but it's not here\n", bnum); } UNLOCK(bc.lock); return 0; } static cache_ent * new_cache_ent(int bsize) { cache_ent *ce; ce = (cache_ent *)calloc(1, sizeof(cache_ent)); if (ce == NULL) { panic("*** error: cache can't allocate memory!\n"); return NULL; } ce->data = malloc(bsize); if (ce->data == NULL) { free(ce); panic("** error cache can't allocate data memory\n"); UNLOCK(bc.lock); return NULL; } ce->dev = -1; ce->block_num = -1; return ce; } static void get_ents(cache_ent **ents, int num_needed, int max, int *num_gotten, int bsize) { int cur, retry_counter = 0, max_retry = num_needed * 256; cache_ent *ce; if (num_needed > max) panic("get_ents: num_needed %d but max %d (doh!)\n", num_needed, max); /* if the cache isn't full yet, just allocate the blocks */ for(cur=0; bc.cur_blocks < bc.max_blocks && cur < num_needed; cur++) { ents[cur] = new_cache_ent(bsize); if (ents[cur] == NULL) break; bc.cur_blocks++; } /* pluck off blocks from the LRU end of the normal list, keep trying too */ while(cur < num_needed && retry_counter < max_retry) { for(ce=bc.normal.lru; ce && cur < num_needed; ce=ce->next) { if (ce->lock) panic("get_ents: normal list has locked blocks (ce 0x%x)\n",ce); if (ce->flags & CE_BUSY) /* don't touch busy blocks */ continue; ce->flags |= CE_BUSY; ents[cur++] = ce; } if (cur < num_needed) { UNLOCK(bc.lock); snooze(10000); LOCK(bc.lock); retry_counter++; } } if (cur < num_needed && retry_counter >= max_retry) { /* oh shit! */ dump_cache_list(); UNLOCK(bc.lock); panic("get_ents: waited too long; can't get enough ce's (c %d n %d)\n", cur, num_needed); } /* If the last block is a dirty one, try to get more of 'em so that we can flush a bunch of blocks at once. */ if (cur && cur < max && ((ents[cur-1]->flags & CE_DIRTY) || ents[cur-1]->clone)) { for(ce=ents[cur-1]->next; ce && cur < max; ce=ce->next) { if (ce->flags & CE_BUSY) /* don't touch busy blocks */ continue; if (ce->lock) panic("get_ents:2 dirty list has locked blocks (ce 0x%x)\n",ce); ce->flags |= CE_BUSY; ents[cur++] = ce; } } *num_gotten = cur; } static int read_into_ents(int dev, fs_off_t bnum, cache_ent **ents, int num, int bsize) { int i, ret; struct iovec *iov; iov = get_iovec_array(); for (i = 0; i < num; i++) { iov[i].iov_base = ents[i]->data; iov[i].iov_len = bsize; } if (chatty_io > 2) printf("readv @ %Ld for %d blocks (at %Ld, block_size = %d)\n", bnum, num, bnum*bsize, bsize); ret = readv_pos(dev, bnum*bsize, iov, num); release_iovec_array(iov); if (ret != num*bsize) { printf("read_into_ents: asked to read %d bytes but got %d\n", num*bsize, ret); printf("*** iov @ %p (num %d)\n", iov, num); return EINVAL; } else return 0; } #define CACHE_READ 0x0001 #define CACHE_WRITE 0x0002 #define CACHE_NOOP 0x0004 /* for getting empty blocks */ #define CACHE_LOCKED 0x0008 #define CACHE_READ_AHEAD_OK 0x0010 /* it's ok to do read-ahead */ static char * op_to_str(int op) { static char buff[128]; if (op & CACHE_READ) strcpy(buff, "READ"); else if (op & CACHE_WRITE) strcpy(buff, "WRITE"); else if (op & CACHE_NOOP) strcpy(buff, "NOP"); if (op & CACHE_LOCKED) strcat(buff, " LOCKED"); if (op & CACHE_READ_AHEAD_OK) strcat(buff, " (AHEAD)"); return buff; } static int cache_block_io(int dev, fs_off_t bnum, void *data, fs_off_t num_blocks, int bsize, int op, void **dataptr) { size_t err = 0; cache_ent *ce; cache_ent_list *cel; if (chatty_io > 1) printf("cbio: bnum = %Ld, num_blocks = %Ld, bsize = %d, op = %s\n", bnum, num_blocks, bsize, op_to_str(op)); /* some sanity checks first */ if (bsize == 0) panic("cache_io: block size == 0 for bnum %Ld?!?\n", bnum); if (num_blocks == 0) panic("cache_io: bnum %Ld has num_blocks == 0!\n", bnum); if (data == NULL && dataptr == NULL) { printf("major butthead move: null data and dataptr! bnum %Ld:%Ld\n", bnum, num_blocks); return ENOMEM; } if (data == NULL) { if (num_blocks != 1) /* get_block() should never do that */ panic("cache_io: num_blocks %Ld but should be 1\n", num_blocks); if (op & CACHE_WRITE) panic("cache_io: get_block() asked to write?!?\n"); } if (bnum + num_blocks > max_device_blocks[dev]) { printf("dev %d: access to blocks %Ld:%Ld but max_dev_blocks is %Ld\n", dev, bnum, num_blocks, max_device_blocks[dev]); // let the app crash here *(int *)0x3100 = 0xc0debabe; return EINVAL; } last_cache_access = system_time(); /* if the i/o is greater than 64k, do it directly */ if (num_blocks * bsize >= 64 * 1024) { char *ptr; fs_off_t tmp; if (data == NULL || (op & CACHE_LOCKED)) { panic("*** asked to do a large locked io that's too hard!\n"); } if (op & CACHE_READ) { if (read_phys_blocks(dev, bnum, data, num_blocks, bsize) != 0) { printf("cache read:read_phys_blocks failed (%s on blocks %Ld:%Ld)!\n", strerror(errno), bnum, num_blocks); return EINVAL; } LOCK(bc.lock); /* if any of the blocks are in the cache, grab them instead */ ptr = data; for(tmp=bnum; tmp < bnum+num_blocks; tmp++, ptr+=bsize) { ce = block_lookup(dev, tmp); /* if we find a block in the cache we have to copy its data just in case it is more recent than what we just read from disk (which could happen if someone wrote these blocks after we did the read but before we locked the cache and entered this loop). */ if (ce) { if (tmp != ce->block_num || dev != ce->dev) { UNLOCK(bc.lock); panic("*** error4: looked up dev %d block %Ld but " "found %d %Ld\n", dev, tmp, ce->dev, ce->block_num); } memcpy(ptr, ce->data, bsize); } } UNLOCK(bc.lock); } else if (op & CACHE_WRITE) { LOCK(bc.lock); /* if any of the blocks are in the cache, update them too */ ptr = data; for(tmp=bnum; tmp < bnum+num_blocks; tmp++, ptr+=bsize) { ce = block_lookup(dev, tmp); if (ce) { if (tmp != ce->block_num || dev != ce->dev) { UNLOCK(bc.lock); panic("*** error5: looked up dev %d block %Ld but " "found %d %Ld\n", dev, tmp, ce->dev, ce->block_num); return EBADF; } /* XXXdbg -- this isn't strictly necessary */ if (ce->clone) { printf("over-writing cloned data (ce %p bnum %Ld)...\n", ce, tmp); flush_cache_ent(ce); } /* copy the data into the cache */ memcpy(ce->data, ptr, bsize); } } UNLOCK(bc.lock); if (write_phys_blocks(dev, bnum, data, num_blocks, bsize) != 0) { printf("cache write: write_phys_blocks failed (%s on blocks " "%Ld:%Ld)!\n", strerror(errno), bnum, num_blocks); return EINVAL; } } else { printf("bad cache op %d (bnum %Ld nblocks %Ld)\n", op, bnum, num_blocks); return EINVAL; } return 0; } LOCK(bc.lock); while(num_blocks) { ce = block_lookup(dev, bnum); if (ce) { if (bnum != ce->block_num || dev != ce->dev) { UNLOCK(bc.lock); panic("*** error6: looked up dev %d block %ld but found " "%d %ld\n", dev, bnum, ce->dev, ce->block_num); return EBADF; } if (bsize != ce->bsize) { panic("*** requested bsize %d but ce->bsize %d ce @ 0x%x\n", bsize, ce->bsize, ce); } /* delete this ent from the list it is in because it may change */ if (ce->lock) cel = &bc.locked; else cel = &bc.normal; delete_from_list(cel, ce); if (op & CACHE_READ) { if (data && data != ce->data) { memcpy(data, ce->data, bsize); } else if (dataptr) { *dataptr = ce->data; } else { printf("cbio:data %p dptr %p ce @ %p ce->data %p\n", data, dataptr, ce, ce->data); } } else if (op & CACHE_WRITE) { if (data && data != ce->data) memcpy(ce->data, data, bsize); ce->flags |= CE_DIRTY; } else if (op & CACHE_NOOP) { memset(ce->data, 0, bsize); if (data) memset(data, 0, bsize); if (dataptr) *dataptr = ce->data; ce->flags |= CE_DIRTY; } else { panic("cached_block_io: bogus op %d\n", op); } if (op & CACHE_LOCKED) ce->lock++; if (ce->lock) cel = &bc.locked; else cel = &bc.normal; /* now put this ent at the head of the appropriate list */ add_to_head(cel, ce); if (data != NULL) data = (void *)((char *)data + bsize); bnum += 1; num_blocks -= 1; continue; } else { /* it's not in the cache */ int cur, cur_nblocks, num_dirty, real_nblocks, num_needed; cache_ent *ents[NUM_FLUSH_BLOCKS]; /* here we find out how many additional blocks in this request are not in the cache. the idea is that then we can do one big i/o on that many blocks at once. */ for(cur_nblocks=1; cur_nblocks < num_blocks && cur_nblocks < NUM_FLUSH_BLOCKS; cur_nblocks++) { /* we can call hash_lookup() directly instead of block_lookup() because we don't care about the state of the busy bit of the block at this point */ if (hash_lookup(&bc.ht, dev, bnum + cur_nblocks)) break; } /* here we try to figure out how many extra blocks we should read for read-ahead. we want to read as many as possible that are not already in the cache and that don't cause us to try and read beyond the end of the disk. */ if ((op & CACHE_READ) && (op & CACHE_READ_AHEAD_OK) && (cur_nblocks * bsize) < read_ahead_size) { for(num_needed=cur_nblocks; num_needed < (read_ahead_size / bsize); num_needed++) { if ((bnum + num_needed) >= max_device_blocks[dev]) break; if (hash_lookup(&bc.ht, dev, bnum + num_needed)) break; } } else { num_needed = cur_nblocks; } /* this will get us pointers to a bunch of cache_ents we can use */ get_ents(ents, num_needed, NUM_FLUSH_BLOCKS, &real_nblocks, bsize); if (real_nblocks < num_needed) { panic("don't have enough cache ents (need %d got %d %ld::%d)\n", num_needed, real_nblocks, bnum, num_blocks); } /* There are now three variables used as limits within the ents array. This is how they are related: cur_nblocks <= num_needed <= real_nblocks Ents from 0 to cur_nblocks-1 are going to be used to fulfill this IO request. Ents from cur_nblocks to num_needed-1 are for read-ahead. Ents from num_needed to real_nblocks are extra blocks that get_ents() asked us to flush. Often (and always on writes) cur_nblocks == num_needed. Below, we sort the list of ents so that when we flush them they go out in order. */ qsort(ents, real_nblocks, sizeof(cache_ent **), cache_ent_cmp); /* delete each ent from its list because it will change. also count up how many dirty blocks there are and insert into the hash table any new blocks so that no one else will try to read them in when we release the cache semaphore to do our I/O. */ for(cur=0,num_dirty=0; cur < real_nblocks; cur++) { ce = ents[cur]; ce->flags |= CE_BUSY; /* insert the new block into the hash table with its new block number. note that the block is still in the hash table for its old block number -- and it has to be until we are done flushing it from the cache (to prevent someone else from sneaking in in front of us and trying to read the same block that we're flushing). */ if (cur < num_needed) { if (hash_insert(&bc.ht, dev, bnum + cur, ce) != 0) panic("could not insert cache ent for %d %ld (0x%lx)\n", dev, bnum + cur, (ulong)ents[cur]); } if (ce->dev == -1) continue; if ((ce->flags & CE_DIRTY) || ce->clone) num_dirty++; if (ce->lock) panic("cbio: can't use locked blocks here ce @ 0x%x\n",ce); else cel = &bc.normal; delete_from_list(cel, ce); } ce = NULL; /* we release the block cache semaphore here so that we can go do all the i/o we need to do (flushing dirty blocks that we're kicking out as well as reading any new data). because all the blocks we're touching are marked busy no one else should mess with them while we're doing this. */ if (num_dirty || (op & CACHE_READ)) { UNLOCK(bc.lock); /* this flushes any blocks we're kicking out that are dirty */ if (num_dirty && (err = flush_ents(ents, real_nblocks)) != 0) { printf("flush ents failed (ents @ 0x%lx, nblocks %d!\n", (ulong)ents, cur_nblocks); goto handle_err; } } /* now that everything is flushed to disk, go through and make sure that the data blocks we're going to use are the right block size for this current request (it's possible we're kicking out some smaller blocks and need to reallocate the data block pointer). We do this in two steps, first free'ing everything and then going through and doing the malloc's to try and be nice to the memory system (i.e. allow it to coalesce stuff, etc). */ err = 0; for(cur=0; cur < num_needed; cur++) { if (ents[cur]->bsize != bsize) { free(ents[cur]->data); ents[cur]->data = NULL; if (ents[cur]->clone) { free(ents[cur]->clone); ents[cur]->clone = NULL; } } } for(cur=0; cur < num_needed; cur++) { if (ents[cur]->data == NULL) { ents[cur]->data = (void *)malloc(bsize); ents[cur]->bsize = bsize; } if (ents[cur]->data == NULL) { printf("cache: no memory for block (bsize %d)!\n", bsize); err = ENOMEM; break; } } /* if this condition is true it's a pretty serious error. we'll try and back out gracefully but we're in pretty deep at this point and it ain't going to be easy. */ handle_err: if (err) { for(cur=0; cur < num_needed; cur++) { cache_ent *tmp_ce; tmp_ce = (cache_ent *)hash_delete(&bc.ht,dev,bnum+cur); if (tmp_ce != ents[cur]) { panic("hash_del0: %d %ld got 0x%lx, not 0x%lx\n", dev, bnum+cur, (ulong)tmp_ce, (ulong)ents[cur]); } tmp_ce = (cache_ent *)hash_delete(&bc.ht,ents[cur]->dev, ents[cur]->block_num); if (tmp_ce != ents[cur]) { panic("hash_del1: %d %ld got 0x%lx, not 0x%lx\n", ents[cur]->dev, ents[cur]->block_num, (ulong)tmp_ce, (ulong)ents[cur]); } ents[cur]->flags &= ~CE_BUSY; if (ents[cur]->data) free(ents[cur]->data); free(ents[cur]); ents[cur] = NULL; bc.cur_blocks--; } if (cur < real_nblocks) { LOCK(bc.lock); for(; cur < real_nblocks; cur++) { ents[cur]->flags &= ~CE_BUSY; /* we have to put them back here */ add_to_tail(&bc.normal, ents[cur]); } UNLOCK(bc.lock); } return ENOMEM; } /* If we go into this if statement, the block cache lock has *already been released* up above when we flushed the dirty entries. As always, since the blocks we're mucking with are marked busy, they shouldn't get messed with. */ err = 0; if (num_dirty || (op & CACHE_READ)) { /* this section performs the i/o that we need to do */ if (op & CACHE_READ) { err = read_into_ents(dev, bnum, ents, num_needed, bsize); } else { err = 0; } if (err != 0) { printf("err %s on dev %d block %Ld:%d (%d) " "data %p, ents[0] %p\n", strerror(errno), dev, bnum, cur_nblocks, bsize, data, ents[0]); } /* acquire the semaphore here so that we can go on mucking with the cache data structures. We need to delete old block numbers from the hash table and set the new block number's for the blocks we just read in. We also put the read-ahead blocks at the head of mru list. */ LOCK(bc.lock); } for(cur=0; cur < num_needed; cur++) { cache_ent *tmp_ce; ce = ents[cur]; if (ce->dev != -1) { tmp_ce = hash_delete(&bc.ht, ce->dev, ce->block_num); if (tmp_ce == NULL || tmp_ce != ce) { panic("*** hash_delete failure (ce 0x%x tce 0x%x)\n", ce, tmp_ce); } } if (err == 0 && cur >= cur_nblocks) { ce->dev = dev; ce->block_num = bnum + cur; ce->flags &= ~CE_BUSY; add_to_head(&bc.normal, ce); } } ce = NULL; /* clear the busy bit on the blocks we force-flushed and put them on the normal list since they're now clean. */ for(; cur < real_nblocks; cur++) { ents[cur]->flags &= ~CE_BUSY; if (ents[cur]->lock) panic("should not have locked blocks here (ce 0x%x)\n", ents[cur]); add_to_tail(&bc.normal, ents[cur]); } if (err) { /* then we have some cleanup to do */ for(cur=0; cur < num_needed; cur++) { cache_ent *tmp_ce; /* we delete all blocks from the cache so we don't leave partially written blocks in the cache */ tmp_ce = (cache_ent *)hash_delete(&bc.ht,dev,bnum+cur); if (tmp_ce != ents[cur]) { panic("hash_del: %d %ld got 0x%lx, not 0x%lx\n", dev, bnum+cur, (ulong)tmp_ce, (ulong)ents[cur]); } ce = ents[cur]; ce->flags &= ~CE_BUSY; free(ce->data); ce->data = NULL; free(ce); ents[cur] = NULL; bc.cur_blocks--; } ce = NULL; UNLOCK(bc.lock); return err; } /* last step: go through and make sure all the cache_ent structures have the right data in them, delete old guys, etc. */ for(cur=0; cur < cur_nblocks; cur++) { ce = ents[cur]; if (ce->dev != -1) { /* then clean this guy up */ if (ce->next || ce->prev) panic("ce @ 0x%x should not be in a list yet!\n", ce); if (ce->clone) free(ce->clone); if (ce->data == NULL) panic("ce @ 0x%lx has a null data ptr\n", (ulong)ce); } ce->dev = dev; ce->block_num = bnum + cur; ce->bsize = bsize; ce->flags = CE_NORMAL; ce->lock = 0; ce->clone = NULL; ce->func = ce->arg = NULL; ce->next = ce->prev = NULL; if (op & CACHE_READ) { if (data) memcpy(data, ce->data, bsize); } else if (op & CACHE_WRITE) { ce->flags |= CE_DIRTY; memcpy(ce->data, data, bsize); } else if (op & CACHE_NOOP) { memset(ce->data, 0, bsize); if (data) memset(data, 0, bsize); ce->flags |= CE_DIRTY; } if (op & CACHE_LOCKED) { ce->lock++; cel = &bc.locked; } else { cel = &bc.normal; } /* now stick this puppy at the head of the mru list */ add_to_head(cel, ce); if (dataptr) { *dataptr = ce->data; } if (data != NULL) data = (void *)((char *)data + bsize); else if (cur_nblocks != 1) panic("cache can't handle setting data_ptr twice!\n"); } /* end of for(cur=0; cur < cur_nblocks; cur++) */ bnum += cur_nblocks; num_blocks -= cur_nblocks; } /* end of else it's not in the cache */ } /* end of while(num_blocks) */ UNLOCK(bc.lock); return 0; } void * get_block(int dev, fs_off_t bnum, int bsize) { void *data; if (cache_block_io(dev, bnum, NULL, 1, bsize, CACHE_READ|CACHE_LOCKED|CACHE_READ_AHEAD_OK, &data) != 0) return NULL; return data; } void * get_empty_block(int dev, fs_off_t bnum, int bsize) { void *data; if (cache_block_io(dev, bnum, NULL, 1, bsize, CACHE_NOOP|CACHE_LOCKED, &data) != 0) return NULL; return data; } int cached_read(int dev, fs_off_t bnum, void *data, fs_off_t num_blocks, int bsize) { return cache_block_io(dev, bnum, data, num_blocks, bsize, CACHE_READ | CACHE_READ_AHEAD_OK, NULL); } int cached_write(int dev, fs_off_t bnum, const void *data, fs_off_t num_blocks,int bsize) { return cache_block_io(dev, bnum, (void *)data, num_blocks, bsize, CACHE_WRITE, NULL); } int cached_write_locked(int dev, fs_off_t bnum, const void *data, fs_off_t num_blocks, int bsize) { return cache_block_io(dev, bnum, (void *)data, num_blocks, bsize, CACHE_WRITE | CACHE_LOCKED, NULL); } void force_cache_flush(int dev, int prefer_log_blocks) { int i, count = 0; cache_ent *ce; cache_ent *ents[NUM_FLUSH_BLOCKS]; LOCK(bc.lock); for(ce=bc.normal.lru; ce; ce=ce->next) { if ((ce->dev == dev) && (ce->flags & CE_BUSY) == 0 && ((ce->flags & CE_DIRTY) || ce->clone) && ((prefer_log_blocks && ce->func) || (prefer_log_blocks == 0))) { ce->flags |= CE_BUSY; ents[count++] = ce; if (count >= NUM_FLUSH_BLOCKS) { break; } } } /* if we've got some room left, try and grab any cloned blocks */ if (count < NUM_FLUSH_BLOCKS) { for(ce=bc.locked.lru; ce; ce=ce->next) { if ((ce->dev == dev) && (ce->flags & CE_BUSY) == 0 && (ce->clone)) { ce->flags |= CE_BUSY; ents[count++] = ce; if (count >= NUM_FLUSH_BLOCKS) { break; } } } } UNLOCK(bc.lock); if (count != 0) { qsort(ents, count, sizeof(cache_ent **), cache_ent_cmp); flush_ents(ents, count); for(i=0; i < count; i++) ents[i]->flags &= ~CE_BUSY; } }