1""" 2Storing 25.000.000 rows in a Metakit file. 3 4(C) Christian Tismer, Professional Net Service 5 first version from 990822 6 update: improved, faster spreading. 7 8This implementation is hereby donated to JCW, therefore 9(C) Jean-Claude Wippler (Equi4 Software) 1999 10 11Data structure: 12We split the main view by some number of subviews. 13This gives us one level of indirection. 14 15First simple test: 1610 fields of tiny integers. 17Addressing is done only by row number. 18 19In order to allow for deletes and inserts, we keep 20a list of block sizes and do a little B-tree like 21juggling. 22 23""" 24 25import Mk4py 26mk=Mk4py 27 28import whrandom, string, sys, bisect 29 30class big_mk: 31 def __init__(self, dbpath, rw): 32 self.db = mk.storage(dbpath, rw) 33 34 def getas(self, struc_str): 35 parts = string.split(struc_str, "[", 1) 36 if len(parts) < 2: 37 self.db.getas(parts[0]) 38 return # this was a delete 39 self.main_name, rest = parts 40 ret= big_view(self.db.getas("%s[data[%s]" % (self.main_name, rest))) 41 ret.big_db = self 42 return ret 43 44 def commit(self): return self.db.commit() 45 def rollback(self): return self.db.rollback() 46 def description(self): return self.db.description() 47 48class big_view: 49 def __init__(self, view): 50 self.view = view 51 if len(self.view) == 0: 52 self.view.append() 53 self.calc_recnos() 54 self.blocksize = blocksize 55 self.lower = self.blocksize *2/3 56 self.upper = self.blocksize *2 - self.lower 57 self.bisect = bisect.bisect 58 self.names = [] 59 for prop in self.view[-1].data.structure(): 60 self.names.append(prop.name) 61 62 def __len__(self): 63 rn = self.recnos 64 if not rn: return 0 65 return rn[-1]+len(self.view[-1].data) 66 67 def __getitem__(self, idx): 68 main, sub = self._seek(idx) 69 return self.view[main].data[sub] 70 71 def __setitem__(self, idx, rec): 72 main, sub = self._seek(idx) 73 self.view[main].data[sub] = rec 74 75 """ 76 we can't do slices yet, since I have no idea 77 if this is necessary, and I don't see exactly 78 how this should work 79 """ 80 81 def append(self, record = None): 82 v = self.view 83 if not self.recnos or len(v[-1].data) >= self.blocksize: 84 v.append() 85 self.calc_recnos() 86 return self.recnos[-1] + v[-1].data.append(record) 87 88 def insert(self, idx, rec=None): 89 main, sub = self._seek(idx) 90 view = self.view[main].data 91 view.insert(sub, rec) 92 if len(view) > self.upper: 93 self._balance(main) 94 self.calc_recnos() 95 96 def delete(self, idx): 97 main, sub = self._seek(idx) 98 view = self.view[main].data 99 view.delete(sub) 100 if len(view) <= self.lower: 101 self._balance(main) 102 self.calc_recnos() 103 104 def _seek(self, idx): 105 rn = self.recnos 106 pos = self.bisect(rn, idx)-1 107 base = rn[pos] 108 return pos, idx-base 109 110 def calc_recnos(self): 111 v = self.view 112 res = [None] * len(v) 113 recno = 0 114 for i in range(len(res)): 115 res[i] = recno 116 recno = recno + len(v[i].data) 117 self.recnos = res 118 119 def _balance(self, spot): 120 """ 121 very simple approach: we merge about three 122 blocks and spread them again. 123 """ 124 v = self.view 125 if spot < 0 or spot > len(v) or len(v)==1 : return 126 if spot > 0: 127 spot = spot-1 128 self._merge(spot) 129 if spot < len(v)-1: 130 self._merge(spot) 131 self._spread(spot) 132 133 def _merge(self, spot): 134 """ 135 merge this block and the next one. 136 Delete the then empty next one 137 """ 138 v = self.view 139 v[spot].data = v[spot].data + v[spot+1].data 140 v.delete(spot+1) 141 142 def _spread(self, spot): 143 """ 144 Spread this block into equally sized ones. 145 """ 146 v = self.view 147 source = v[spot] 148 bs = self.blocksize 149 nblocks = (len(source.data)+bs-1) / bs 150 chunk = len(source.data) / nblocks +1 151 if chunk >= self.upper-10 or chunk < self.lower+10: chunk = bs 152 chunk, nextchunk = len(source.data) % chunk, chunk 153 if chunk < self.lower: 154 chunk = chunk + nextchunk 155 while len(source.data) > chunk: 156 self.view.insert(spot+1) 157 self.view[spot+1].data=source.data[-chunk:] 158 source.data = source.data[:-chunk] 159 chunk = nextchunk 160 return 161 162 def _getrec(self, subview): 163 res = [] 164 ga = getattr 165 for name in self.names: 166 res.append(ga(subview, name)) 167 return res 168 169 def __del__(self): 170 del self.view 171 172view_struc = "big_test[A:I,B:I,C:I,D:I,E:I,F:I,G:I,H:I,J:I,K:I]" 173 174dbpath = "_bigfile.mk" 175 176n_fields = 10 177 178rand_len = 4096 + n_fields 179 180random_ints = map(lambda x:whrandom.randint(0,256), range(rand_len)) 181 182blocksize = 10000 # default size for append, but not mandatory 183 184ds = None 185 186db = big_mk(dbpath, 1) 187 188ds = db.getas(view_struc) 189 190def make_rec(idx): 191 return random_ints[idx:idx+n_fields] 192 193def add_recs(n=1000): 194 for i in range(n): 195 idx = len(ds) % rand_len 196 ds.append(make_rec(idx)) 197 198if __name__ == '__main__': 199 # expect this to take hours (1000..2000 recs/sec on modern PII) 200 import sys 201 for i in xrange(25000): 202 add_recs() 203 sys.stdout.write(".") 204 sys.stdout.flush() 205 if i % 50 == 49: 206 db.commit() 207 sys.stdout.write("\n") 208