PerfectHashBuilder.java revision 15122:b211a52a7439
1326941Sdim/*
2326941Sdim * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
3353358Sdim * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4353358Sdim *
5353358Sdim * This code is free software; you can redistribute it and/or modify it
6326941Sdim * under the terms of the GNU General Public License version 2 only, as
7326941Sdim * published by the Free Software Foundation.  Oracle designates this
8326941Sdim * particular file as subject to the "Classpath" exception as provided
9326941Sdim * by Oracle in the LICENSE file that accompanied this code.
10326941Sdim *
11326941Sdim * This code is distributed in the hope that it will be useful, but WITHOUT
12326941Sdim * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13326941Sdim * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14326941Sdim * version 2 for more details (a copy is included in the LICENSE file that
15344779Sdim * accompanied this code).
16326941Sdim *
17326941Sdim * You should have received a copy of the GNU General Public License version
18326941Sdim * 2 along with this work; if not, write to the Free Software Foundation,
19326941Sdim * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20360784Sdim *
21326941Sdim * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22326941Sdim * or visit www.oracle.com if you need additional information or have any
23326941Sdim * questions.
24326941Sdim */
25326941Sdim
26326941Sdimpackage jdk.tools.jlink.internal;
27326941Sdim
28326941Sdimimport java.lang.reflect.Array;
29326941Sdimimport java.util.ArrayList;
30326941Sdimimport java.util.Arrays;
31326941Sdimimport java.util.LinkedHashMap;
32326941Sdimimport java.util.List;
33341825Sdimimport java.util.Map;
34326941Sdimimport jdk.internal.jimage.ImageStringsReader;
35341825Sdim
36341825Sdim/*
37326941Sdim * The algorithm used here is outlined in Applications of Finite Automata
38353358Sdim * Representing Large Vocabularies - Claudio L Lucchesi and Tomasz Kowaltowski,
39353358Sdim * 1992, and A Practical Minimal Perfect Hashing Method - Fabiano C. Botelho1,
40326941Sdim * Yoshiharu Kohayakawa, and Nivio Ziviani, 2005.
41341825Sdim *
42341825Sdim * The primary JDK use of this algorithm is managing the jimage location index.
43341825Sdim *
44341825Sdim * The goal of PerfectHashBuilder is to construct an automaton which maps a
45341825Sdim * string key to a unique index 0..N-1, where N is the number of key-value pairs.
46341825Sdim * What makes MPHM effective is that the size of the lookup table is N or very
47341825Sdim * near N, and the minimum lookup is O(1) maximum lookup is O(2).
48341825Sdim *
49341825Sdim * The result of PerfectHashBuilder is two integer arrays, redirect and order.
50360784Sdim * The redirect table provides a 1-1 mapping to the order table, using the
51360784Sdim * reader algorithm described further on.  The order table provides a mapping
52360784Sdim * to entries.  If entries are fixed size and can be put in a direct table, then
53360784Sdim * the order table can be used to construct the direct table and then discarded.
54326941Sdim *
55326941Sdim * The steps for constructing the lookup tables are as follows;
56341825Sdim *
57341825Sdim *   - Compute an MPHM hash for each key, based on a fixed base value modulo N.
58341825Sdim *     Note, the hash is based on the modified UTF-8 of the key, simplifying
59341825Sdim *     computation in native code.
60341825Sdim *
61341825Sdim *   - Combine keys that map to the same hash code (collisions) into bucket
62341825Sdim *     chains.
63341825Sdim *
64341825Sdim *   - Sort bucket chains by length of chains, longest first (most collisions.)
65360784Sdim *     Sorting is done to pack the redirect table with the worst collision
66360784Sdim *     offenders first.
67360784Sdim *
68360784Sdim *   - For each chain, recompute the hash of each key using a new base value.
69360784Sdim *     Recomputation should give a different key distribution. A tally is kept
70326941Sdim *     of where the key maps, using the order table. The tally is used to detect
71326941Sdim *     new collisions. If there are further collisions, then restart
72326941Sdim *     redistribution using a different hash base value.  If a chain is
73326941Sdim *     successfully distributed, then the base value used to compute the hash
74326941Sdim *     is recorded in the redirect table.
75326941Sdim *
76326941Sdim *   - Once all colliding chains are resolved (length > 1), then the chains with
77326941Sdim *     only one entry are used to fill in the empty slots in the order table.
78326941Sdim *     These keys are recorded in the redirect table using the twos complement
79326941Sdim *     of the order index.
80326941Sdim *
81326941Sdim *   - It is possible that a given set of keys cannot be packed into a table of
82326941Sdim *     size N.  If this situation occurs then the size of the table is
83326941Sdim *     adjusted so that keys distribute differently.
84326941Sdim *
85326941Sdim * Readers algoritm;
86326941Sdim *
87326941Sdim *   - Compute the hash for the key using the fixed base value modulo N.  This
88326941Sdim *     will provide an index into the redirect table. The integer value in the
89326941Sdim *     redirect table will determine the next step.
90326941Sdim *
91326941Sdim *   - If the value in the redirect table is positive, then that value is used
92326941Sdim *     to rehash the key to get the index into the order table.
93326941Sdim *
94326941Sdim *   - If the value in the redirect table is negative, then that value is the
95326941Sdim *     twos complement of the index into the order table.
96326941Sdim *
97326941Sdim *   - If the value in the redirect table is zero, then there is no matching
98326941Sdim *     entry.
99326941Sdim *
100326941Sdim *   - Note that the resulting entry needs to be validated to ensure a match.
101326941Sdim *     This is typically done by comparing the key with the key in entry.
102326941Sdim */
103326941Sdimpublic class PerfectHashBuilder<E> {
104326941Sdim    private static final int RETRY_LIMIT = 1000;
105326941Sdim
106326941Sdim    private Class<?> entryComponent;
107326941Sdim    private Class<?> bucketComponent;
108326941Sdim
109326941Sdim    private final Map<String, Entry<E>> map = new LinkedHashMap<>();
110326941Sdim    private int[] redirect;
111326941Sdim    private Entry<E>[] order;
112326941Sdim    private int count = 0;
113326941Sdim
114326941Sdim    @SuppressWarnings("EqualsAndHashcode")
115326941Sdim    public static class Entry<E> {
116326941Sdim        private final String key;
117326941Sdim        private final E value;
118326941Sdim
119326941Sdim        Entry() {
120326941Sdim            this("", null);
121326941Sdim        }
122326941Sdim
123326941Sdim        Entry(String key, E value) {
124326941Sdim            this.key = key;
125326941Sdim            this.value = value;
126341825Sdim        }
127326941Sdim
128326941Sdim        String getKey() {
129326941Sdim            return key;
130326941Sdim        }
131326941Sdim
132326941Sdim        E getValue() {
133326941Sdim            return value;
134326941Sdim        }
135326941Sdim
136326941Sdim        int hashCode(int seed) {
137326941Sdim            return ImageStringsReader.hashCode(key, seed);
138344779Sdim        }
139344779Sdim
140326941Sdim        @Override
141341825Sdim        public int hashCode() {
142344779Sdim            return ImageStringsReader.hashCode(key);
143353358Sdim        }
144353358Sdim
145353358Sdim        @Override
146353358Sdim        public boolean equals(Object other) {
147353358Sdim            if (other == this) {
148353358Sdim                return true;
149353358Sdim            }
150353358Sdim            if (!(other instanceof Entry)) {
151353358Sdim                return false;
152353358Sdim            }
153360784Sdim            Entry<?> entry = (Entry<?>) other;
154353358Sdim            return entry.key.equals(key);
155353358Sdim        }
156353358Sdim    }
157353358Sdim
158353358Sdim    static class Bucket<E> implements Comparable<Bucket<E>> {
159353358Sdim        final List<Entry<E>> list = new ArrayList<>();
160353358Sdim
161353358Sdim        void add(Entry<E> entry) {
162353358Sdim            list.add(entry);
163353358Sdim        }
164353358Sdim
165353358Sdim        int getSize() {
166353358Sdim            return list.size();
167341825Sdim        }
168341825Sdim
169353358Sdim        List<Entry<E>> getList() {
170353358Sdim            return list;
171341825Sdim        }
172344779Sdim
173341825Sdim        Entry<E> getFirst() {
174341825Sdim            assert !list.isEmpty() : "bucket should never be empty";
175341825Sdim            return list.get(0);
176326941Sdim        }
177326941Sdim
178341825Sdim        @Override
179341825Sdim        public int hashCode() {
180341825Sdim            return getFirst().hashCode();
181341825Sdim        }
182353358Sdim
183326941Sdim        @Override
184326941Sdim        @SuppressWarnings("EqualsWhichDoesntCheckParameterClass")
185341825Sdim        public boolean equals(Object obj) {
186344779Sdim            return this == obj;
187341825Sdim        }
188341825Sdim
189341825Sdim        @Override
190341825Sdim        public int compareTo(Bucket<E> o) {
191341825Sdim            return o.getSize() - getSize();
192344779Sdim        }
193360784Sdim    }
194344779Sdim
195341825Sdim    public PerfectHashBuilder(Class<?> entryComponent, Class<?> bucketComponent) {
196341825Sdim        this.entryComponent = entryComponent;
197341825Sdim        this.bucketComponent = bucketComponent;
198326941Sdim    }
199360784Sdim
200326941Sdim    public int getCount() {
201341825Sdim        return map.size();
202326941Sdim    }
203326941Sdim
204326941Sdim    public int[] getRedirect() {
205326941Sdim        return redirect.clone();
206326941Sdim    }
207344779Sdim
208326941Sdim    public Entry<E>[] getOrder() {
209341825Sdim        return order.clone();
210341825Sdim    }
211341825Sdim
212326941Sdim    public Entry<E> put(String key, E value) {
213326941Sdim        return put(new Entry<>(key, value));
214341825Sdim    }
215341825Sdim
216341825Sdim    public Entry<E> put(Entry<E> entry) {
217341825Sdim        Entry<E> old = map.put(entry.key, entry);
218341825Sdim
219341825Sdim        if (old == null) {
220341825Sdim            count++;
221341825Sdim        }
222341825Sdim
223341825Sdim        return old;
224341825Sdim    }
225341825Sdim
226341825Sdim    @SuppressWarnings("unchecked")
227326941Sdim    public void generate() {
228341825Sdim        // If the table is empty then exit early.
229326941Sdim        boolean redo = count != 0;
230326941Sdim
231326941Sdim        // Repeat until a valid packing is achieved.
232326941Sdim        while (redo) {
233326941Sdim            redo = false;
234326941Sdim
235326941Sdim            // Allocate the resulting redirect and order tables.
236326941Sdim            redirect = new int[count];
237326941Sdim            order = (Entry<E>[])Array.newInstance(entryComponent, count);
238326941Sdim
239344779Sdim            // Place all the entries in bucket chains based on hash. Sort by
240326941Sdim            // length of chain.
241326941Sdim            Bucket<E>[] sorted = createBuckets();
242326941Sdim            int free = 0;
243326941Sdim
244326941Sdim            // Iterate through the chains, longest first.
245326941Sdim            for (Bucket<E> bucket : sorted) {
246326941Sdim                if (bucket.getSize() != 1) {
247326941Sdim                    // Attempt to pack entries until no collisions occur.
248344779Sdim                    if (!collidedEntries(bucket, count)) {
249344779Sdim                        // Failed to pack. Meed to grow table.
250326941Sdim                        redo = true;
251326941Sdim                        break;
252344779Sdim                    }
253326941Sdim                } else {
254326941Sdim                    // A no collision entry (bucket.getSize() == 1). Find a free
255326941Sdim                    // spot in the order table.
256341825Sdim                    for ( ; free < count && order[free] != null; free++) {}
257341825Sdim
258341825Sdim                    // If none found, then grow table.
259344779Sdim                    if (free >= count) {
260341825Sdim                        redo = true;
261344779Sdim                        break;
262341825Sdim                    }
263341825Sdim
264326941Sdim                    // Store entry in order table.
265341825Sdim                    order[free] = bucket.getFirst();
266326941Sdim                    // Twos complement of order index stired in the redirect table.
267326941Sdim                    redirect[(bucket.hashCode() & 0x7FFFFFFF) % count] = -1 - free;
268326941Sdim                    // Update free slot index.
269326941Sdim                    free++;
270326941Sdim                }
271344779Sdim            }
272344779Sdim
273344779Sdim            // If packing failed, then bump table size. Make odd to increase
274344779Sdim            // chances of being relatively prime.
275344779Sdim            if (redo) {
276344779Sdim                count = (count + 1) | 1;
277341825Sdim            }
278341825Sdim        }
279341825Sdim    }
280326941Sdim
281326941Sdim    @SuppressWarnings("unchecked")
282326941Sdim    private Bucket<E>[] createBuckets() {
283326941Sdim        // Build bucket chains based on key hash.  Collisions end up in same chain.
284326941Sdim        Bucket<E>[] buckets = (Bucket<E>[])Array.newInstance(bucketComponent, count);
285353358Sdim
286353358Sdim        map.values().stream().forEach((entry) -> {
287353358Sdim            int index = (entry.hashCode() & 0x7FFFFFFF) % count;
288326941Sdim            Bucket<E> bucket = buckets[index];
289326941Sdim
290326941Sdim            if (bucket == null) {
291326941Sdim                buckets[index] = bucket = new Bucket<>();
292326941Sdim            }
293326941Sdim
294326941Sdim            bucket.add(entry);
295326941Sdim        });
296326941Sdim
297326941Sdim        // Sort chains, longest first.
298326941Sdim        Bucket<E>[] sorted = Arrays.asList(buckets).stream()
299326941Sdim                .filter((bucket) -> (bucket != null))
300326941Sdim                .sorted()
301326941Sdim                .toArray((length) -> {
302341825Sdim                    return (Bucket<E>[])Array.newInstance(bucketComponent, length);
303341825Sdim                });
304341825Sdim
305341825Sdim        return sorted;
306341825Sdim    }
307326941Sdim
308326941Sdim    private boolean collidedEntries(Bucket<E> bucket, int count) {
309326941Sdim        // Track packing attempts.
310326941Sdim        List<Integer> undo = new ArrayList<>();
311326941Sdim        // Start with a new hash seed.
312326941Sdim        int seed = ImageStringsReader.HASH_MULTIPLIER + 1;
313326941Sdim        int retry = 0;
314326941Sdim
315326941Sdim        // Attempt to pack all the entries in a single chain.
316341825Sdim        redo:
317341825Sdim        while (true) {
318341825Sdim            for (Entry<E> entry : bucket.getList()) {
319341825Sdim                // Compute new hash.
320326941Sdim                int index = entry.hashCode(seed) % count;
321326941Sdim
322326941Sdim                // If a collision is detected.
323326941Sdim                if (order[index] != null) {
324344779Sdim                    // Only retry so many times with current table size.
325344779Sdim                    if (++retry > RETRY_LIMIT) {
326344779Sdim                        return false;
327344779Sdim                    }
328344779Sdim
329341825Sdim                    // Undo the attempted packing.
330341825Sdim                    undo.stream().forEach((i) -> {
331341825Sdim                        order[i] = null;
332344779Sdim                    });
333326941Sdim
334344779Sdim                    // Reset the undo list and bump up the hash seed.
335341825Sdim                    undo.clear();
336344779Sdim                    seed++;
337326941Sdim
338344779Sdim                    // Zero seed is not valid.
339326941Sdim                    if (seed == 0) {
340344779Sdim                        seed = 1;
341341825Sdim                    }
342326941Sdim
343353358Sdim                    // Try again.
344353358Sdim                    continue redo;
345353358Sdim                }
346353358Sdim
347353358Sdim                // No collision.
348353358Sdim                order[index] = entry;
349353358Sdim                undo.add(index);
350353358Sdim            }
351353358Sdim
352353358Sdim            // Entire chain packed. Record hash seed used.
353353358Sdim            redirect[(bucket.hashCode() & 0x7FFFFFFF) % count] = seed;
354353358Sdim
355353358Sdim            break;
356353358Sdim        }
357353358Sdim
358        return true;
359    }
360 }
361