1"""
2Storing 25.000.000 rows in a Metakit file.
3
4(C) Christian Tismer, Professional Net Service
5  first version from 990822
6  update: improved, faster spreading.
7
8This implementation is hereby donated to JCW, therefore
9(C) Jean-Claude Wippler (Equi4 Software) 1999
10
11Data structure:
We split the main view into a number of subviews.
13This gives us one level of indirection.
14
15First simple test:
1610 fields of tiny integers.
17Addressing is done only by row number.
18
19In order to allow for deletes and inserts, we keep
20a list of block sizes and do a little B-tree like
21juggling.
22
23"""
24
25import Mk4py
26mk=Mk4py
27
28import whrandom, string, sys, bisect
29
class big_mk:
  """Thin wrapper around a Metakit storage that hands out big_view objects."""

  def __init__(self, dbpath, rw):
    # open (or create, depending on rw) the underlying Metakit file
    self.db = mk.storage(dbpath, rw)

  def getas(self, struc_str):
    """Restructure the storage and return a big_view for it.

    A description without a property list (no '[') is forwarded
    unchanged, which deletes the view; otherwise the layout is
    rewritten as name[data[props]] so rows live in one subview
    per main-view row, and the result is wrapped in a big_view.
    """
    pieces = struc_str.split("[", 1)
    if len(pieces) < 2:
      # no '[' present: this was a delete, nothing to wrap
      self.db.getas(pieces[0])
      return
    self.main_name = pieces[0]
    # pieces[1] still ends with the original ']', so only one extra
    # bracket is needed to close the inserted 'data[' level
    wrapped = big_view(self.db.getas("%s[data[%s]" % (self.main_name, pieces[1])))
    wrapped.big_db = self
    return wrapped

  def commit(self):
    return self.db.commit()

  def rollback(self):
    return self.db.rollback()

  def description(self):
    return self.db.description()
47
class big_view:
  """
  One large virtual view built from many fixed-size Metakit subviews.

  Rows are stored in self.view[i].data.  self.recnos[i] caches the
  global row number of the first row of block i, so a global index
  is mapped to (block, offset) with a single bisect.  Blocks are
  kept between self.lower and self.upper rows by _balance().
  """

  def __init__(self, view):
    # view: the main Metakit view; each of its rows holds one 'data' subview
    self.view = view
    if len(self.view) == 0:
      # make sure at least one (empty) block exists
      self.view.append()
    self.calc_recnos()
    # module-level default block capacity (see bottom of file)
    self.blocksize = blocksize
    # rebalance thresholds: lower = 2/3 of blocksize (Py2 int division),
    # upper = 2*blocksize - lower
    self.lower = self.blocksize *2/3
    self.upper = self.blocksize *2 - self.lower
    # cache the bisect function to avoid the attribute lookup in _seek
    self.bisect = bisect.bisect
    # field names of the record structure, taken from the last block
    self.names = []
    for prop in self.view[-1].data.structure():
      self.names.append(prop.name)

  def __len__(self):
    # total rows = global start of the last block + its own length
    rn = self.recnos
    if not rn: return 0
    return rn[-1]+len(self.view[-1].data)

  def __getitem__(self, idx):
    # fetch the row at global index idx
    main, sub = self._seek(idx)
    return self.view[main].data[sub]

  def __setitem__(self, idx, rec):
    # overwrite the row at global index idx
    main, sub = self._seek(idx)
    self.view[main].data[sub] = rec

  """
  we can't do slices yet, since I have no idea
  if this is necessary, and I don't see exactly
  how this should work
  """

  def append(self, record = None):
    """Append a record, starting a fresh block when the last is full.

    Returns the new global row number (the subview's append returns
    the local index, recnos[-1] is the block's global start).
    """
    v = self.view
    if not self.recnos or len(v[-1].data) >= self.blocksize:
      v.append()
      self.calc_recnos()
    return self.recnos[-1] + v[-1].data.append(record)

  def insert(self, idx, rec=None):
    """Insert rec at global index idx, rebalancing if the block grew too big."""
    main, sub = self._seek(idx)
    view = self.view[main].data
    view.insert(sub, rec)
    if len(view) > self.upper:
      self._balance(main)
    self.calc_recnos()

  def delete(self, idx):
    """Delete the row at global index idx, rebalancing if the block shrank."""
    main, sub = self._seek(idx)
    view = self.view[main].data
    view.delete(sub)
    if len(view) <= self.lower:
      self._balance(main)
    self.calc_recnos()

  def _seek(self, idx):
    """Map a global row index to (block number, offset within block)."""
    rn = self.recnos
    # bisect (bisect_right) - 1 finds the block whose start recno <= idx
    pos = self.bisect(rn, idx)-1
    base = rn[pos]
    return pos, idx-base

  def calc_recnos(self):
    # recompute the prefix sums of block lengths (global start per block)
    v = self.view
    res = [None] * len(v)
    recno = 0
    for i in range(len(res)):
      res[i] = recno
      recno = recno + len(v[i].data)
    self.recnos = res

  def _balance(self, spot):
    """
    very simple approach: we merge about three
    blocks and spread them again.
    """
    # NOTE(review): the guard uses spot > len(v); spot >= len(v) looks
    # intended, but _seek never returns an out-of-range spot anyway.
    v = self.view
    if spot < 0 or spot > len(v) or len(v)==1 : return
    if spot > 0:
      # step back so the previous block takes part in the merge
      spot = spot-1
    self._merge(spot)
    if spot < len(v)-1:
      # merge a second neighbour when one exists
      self._merge(spot)
    self._spread(spot)

  def _merge(self, spot):
    """
    merge this block and the next one.
    Delete the then empty next one
    """
    v = self.view
    v[spot].data = v[spot].data + v[spot+1].data
    v.delete(spot+1)

  def _spread(self, spot):
    """
    Spread this block into equally sized ones.
    """
    v = self.view
    source = v[spot]
    bs = self.blocksize
    # nblocks = ceil(len/bs); chunk approximates an even split (Py2 int div)
    nblocks = (len(source.data)+bs-1) / bs
    chunk = len(source.data) / nblocks +1
    # fall back to plain blocksize when the even split lands near a threshold
    if chunk >= self.upper-10 or chunk < self.lower+10: chunk = bs
    # first cut takes the remainder; every later cut takes a full chunk
    chunk, nextchunk = len(source.data) % chunk, chunk
    if chunk < self.lower:
      # too-small remainder: fold it into the first cut instead
      chunk = chunk + nextchunk
    while len(source.data) > chunk:
      # move the current tail of the source block into a new block after it
      self.view.insert(spot+1)
      self.view[spot+1].data=source.data[-chunk:]
      source.data = source.data[:-chunk]
      chunk = nextchunk
    return

  def _getrec(self, subview):
    # collect the field values of one row, in structure order
    # (the argument appears to be a row object of a 'data' subview)
    res = []
    ga = getattr
    for name in self.names:
      res.append(ga(subview, name))
    return res

  def __del__(self):
    # drop the reference to the underlying Metakit view promptly
    del self.view
171
# layout: 10 tiny-int fields, addressed purely by row number
view_struc = "big_test[A:I,B:I,C:I,D:I,E:I,F:I,G:I,H:I,J:I,K:I]"

dbpath = "_bigfile.mk"

n_fields = 10

# pool of pre-drawn random ints; records are taken as slices of it,
# over-allocated by n_fields so a slice can start anywhere in the base range
rand_len = 4096 + n_fields

random_ints = [whrandom.randint(0, 256) for _dummy in range(rand_len)]

blocksize = 10000 # default size for append, but not mandatory

ds = None

db = big_mk(dbpath, 1)

ds = db.getas(view_struc)
189
def make_rec(idx):
  """Build one record by slicing n_fields ints out of the random pool at idx."""
  stop = idx + n_fields
  return random_ints[idx:stop]
192
def add_recs(n=1000):
  """Append n pseudo-random records to the global dataset ds.

  The slice start is taken modulo (rand_len - n_fields) so every
  record gets the full n_fields values: the pool was over-allocated
  by n_fields (rand_len = 4096 + n_fields) for exactly this reason.
  The old modulus rand_len allowed starts past that point, yielding
  short records for len(ds) % rand_len in (4096, 4105].
  """
  limit = rand_len - n_fields
  for i in range(n):
    idx = len(ds) % limit
    ds.append(make_rec(idx))
197
if __name__ == '__main__':
    # 25000 batches of 1000 rows = 25.000.000 rows total;
    # expect this to take hours (1000..2000 recs/sec on modern PII)
    import sys
    batch = 0
    while batch < 25000:
        add_recs()
        # one progress dot per batch
        sys.stdout.write(".")
        sys.stdout.flush()
        batch = batch + 1
        # commit every 50 batches (same iterations as i % 50 == 49)
        if batch % 50 == 0:
            db.commit()
            sys.stdout.write("\n")
208