1/*	$NetBSD: octeon_rnm.c,v 1.16 2023/03/21 22:07:29 riastradh Exp $	*/
2
3/*
4 * Copyright (c) 2007 Internet Initiative Japan, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29/*
30 * Cavium Octeon Random Number Generator / Random Number Memory `RNM'
31 *
32 *	The RNM unit consists of:
33 *
34 *	1. 128 ring oscillators
35 *	2. an LFSR/SHA-1 conditioner
36 *	3. a 512-byte FIFO
37 *
38 *	When the unit is enabled, there are three modes of operation:
39 *
40 *	(a) deterministic: the ring oscillators are disabled and the
41 *	    LFSR/SHA-1 conditioner operates on fixed inputs to give
42 *	    reproducible results for testing,
43 *
44 *	(b) conditioned entropy: the ring oscillators are enabled and
45 *	    samples from them are fed through the LFSR/SHA-1
46 *	    conditioner before being put into the FIFO, and
47 *
48 *	(c) raw entropy: the ring oscillators are enabled, and a group
49 *	    of eight of them selected at any one time is sampled and
50 *	    fed into the FIFO.
51 *
52 *	Details:
53 *
54 *	- The FIFO is refilled whenever we read out of it, either with
55 *	  a load address or an IOBDMA operation.
56 *
57 *	- The conditioner takes 81 cycles to produce a 64-bit block of
58 *	  output in the FIFO whether in deterministic or conditioned
59 *	  entropy mode, each block consisting of the first 64 bits of a
60 *	  SHA-1 hash.
61 *
62 *	- A group of eight ring oscillators take 8 cycles to produce a
63 *	  64-bit block of output in the FIFO in raw entropy mode, each
64 *	  block consisting of eight consecutive samples from each RO in
65 *	  parallel.
66 *
67 *	The first sample of each RO always seems to be zero.  Further,
68 *	consecutive samples from a single ring oscillator are not
69 *	independent, so naive debiasing like a von Neumann extractor
70 *	falls flat on its face.  And parallel ring oscillators powered
71 *	by the same source may not be independent either, if they end
72 *	up locked.
73 *
74 *	We read out one FIFO's worth of raw samples from groups of 8
75 *	ring oscillators at a time, of 128 total, by going through them
76 *	round robin.  We take 32 consecutive samples from each ring
77 *	oscillator in a group of 8 in parallel before we count one bit
 *	of entropy.  To get 256 bits of entropy, we read 4Kbit of data
 *	from each of the sixteen 8-RO groups.
80 *
81 *	We could use the on-board LFSR/SHA-1 conditioner like the Linux
82 *	driver written by Cavium does, but it's not clear how many RO
83 *	samples go into the conditioner, and our entropy pool is a
84 *	perfectly good conditioner itself, so it seems there is little
85 *	advantage -- other than expedience -- to using the LFSR/SHA-1
86 *	conditioner.  All the manual says is that it samples 125 of the
87 *	128 ROs.  But the Cavium SHA-1 CPU instruction is advertised to
88 *	have a latency of 100 cycles, so it seems implausible that much
89 *	more than one sample from each RO could be squeezed in there.
90 *
91 *	The hardware exposes only 64 bits of each SHA-1 hash, and the
92 *	Linux driver uses 32 bits of that -- which, if treated as full
93 *	entropy, would mean an assessment of 3.9 bits of RO samples to
94 *	get 1 bit of entropy, whereas we take 256 bits of RO samples to
95 *	get one bit of entropy, so this seems reasonably conservative.
96 *
97 * Reference: Cavium Networks OCTEON Plus CN50XX Hardware Reference
98 * Manual, CN50XX-HM-0.99E PRELIMINARY, July 2008.
99 */
100
101#include <sys/cdefs.h>
102__KERNEL_RCSID(0, "$NetBSD: octeon_rnm.c,v 1.16 2023/03/21 22:07:29 riastradh Exp $");
103
104#include <sys/param.h>
105#include <sys/device.h>
106#include <sys/kernel.h>
107#include <sys/rndsource.h>
108#include <sys/systm.h>
109
110#include <mips/locore.h>
111#include <mips/cavium/octeonreg.h>
112#include <mips/cavium/octeonvar.h>
113#include <mips/cavium/include/iobusvar.h>
114#include <mips/cavium/dev/octeon_rnmreg.h>
115#include <mips/cavium/dev/octeon_corereg.h>
116
117#include <sys/bus.h>
118
119//#define	OCTRNM_DEBUG
120
/* Timing of the RNM unit, in cycles.  */
#define	ENT_DELAY_CLOCK		8	/* per 64-bit batch of raw RO samples */
#define	LFSR_DELAY_CLOCK	81	/* to fill the LFSR buffer */
#define	SHA1_DELAY_CLOCK	81	/* to compute the SHA-1 output */

/* Geometry: 128 ring oscillators in groups of 8, and a 512-byte FIFO.  */
#define	NROGROUPS		(128 / 8)
#define	RNG_FIFO_WORDS		(512 / sizeof(uint64_t))
126
127struct octrnm_softc {
128	uint64_t		sc_sample[RNG_FIFO_WORDS];
129	bus_space_tag_t		sc_bust;
130	bus_space_handle_t	sc_regh;
131	krndsource_t		sc_rndsrc;	/* /dev/random source */
132	unsigned		sc_rogroup;
133};
134
135static int octrnm_match(device_t, struct cfdata *, void *);
136static void octrnm_attach(device_t, device_t, void *);
137static void octrnm_rng(size_t, void *);
138static void octrnm_reset(struct octrnm_softc *);
139static void octrnm_conditioned_deterministic(struct octrnm_softc *);
140static void octrnm_conditioned_entropy(struct octrnm_softc *);
141static void octrnm_raw_entropy(struct octrnm_softc *, unsigned);
142static uint64_t octrnm_load(struct octrnm_softc *);
143static void octrnm_iobdma(struct octrnm_softc *, uint64_t *, unsigned);
144static void octrnm_delay(uint32_t);
145
146CFATTACH_DECL_NEW(octrnm, sizeof(struct octrnm_softc),
147    octrnm_match, octrnm_attach, NULL, NULL);
148
149static int
150octrnm_match(device_t parent, struct cfdata *cf, void *aux)
151{
152	struct iobus_attach_args *aa = aux;
153
154	if (strcmp(cf->cf_name, aa->aa_name) != 0)
155		return 0;
156	if (cf->cf_unit != aa->aa_unitno)
157		return 0;
158	return 1;
159}
160
161static void
162octrnm_attach(device_t parent, device_t self, void *aux)
163{
164	struct octrnm_softc *sc = device_private(self);
165	struct iobus_attach_args *aa = aux;
166	uint64_t bist_status, sample, expected = UINT64_C(0xd654ff35fadf866b);
167
168	aprint_normal("\n");
169
170	/* Map the device registers, all two of them.  */
171	sc->sc_bust = aa->aa_bust;
172	if (bus_space_map(aa->aa_bust, aa->aa_unit->addr, RNM_SIZE,
173	    0, &sc->sc_regh) != 0) {
174		aprint_error_dev(self, "unable to map device\n");
175		return;
176	}
177
178	/* Verify that the built-in self-test succeeded.  */
179	bist_status = bus_space_read_8(sc->sc_bust, sc->sc_regh,
180	    RNM_BIST_STATUS_OFFSET);
181	if (bist_status) {
182		aprint_error_dev(self, "RNG built in self test failed: %#lx\n",
183		    bist_status);
184		return;
185	}
186
187	/*
188	 * Reset the core, enable the RNG engine without entropy, wait
189	 * 81 cycles for it to produce a single sample, and draw the
190	 * deterministic sample to test.
191	 *
192	 * XXX Verify that the output matches the SHA-1 computation
193	 * described by the data sheet, not just a known answer.
194	 */
195	octrnm_reset(sc);
196	octrnm_conditioned_deterministic(sc);
197	octrnm_delay(LFSR_DELAY_CLOCK + SHA1_DELAY_CLOCK);
198	sample = octrnm_load(sc);
199	if (sample != expected)
200		aprint_error_dev(self, "self-test: read %016"PRIx64","
201		    " expected %016"PRIx64, sample, expected);
202
203	/*
204	 * Reset the core again to clear the FIFO, and enable the RNG
205	 * engine with entropy exposed directly.  Start from the first
206	 * group of ring oscillators; as we gather samples we will
207	 * rotate through the rest of them.
208	 */
209	octrnm_reset(sc);
210	sc->sc_rogroup = 0;
211	octrnm_raw_entropy(sc, sc->sc_rogroup);
212	octrnm_delay(ENT_DELAY_CLOCK*RNG_FIFO_WORDS);
213
214	/* Attach the rndsource.  */
215	rndsource_setcb(&sc->sc_rndsrc, octrnm_rng, sc);
216	rnd_attach_source(&sc->sc_rndsrc, device_xname(self), RND_TYPE_RNG,
217	    RND_FLAG_DEFAULT | RND_FLAG_HASCB);
218}
219
220static void
221octrnm_rng(size_t nbytes, void *vsc)
222{
223	const unsigned BPB = 256; /* bits of data per bit of entropy */
224	struct octrnm_softc *sc = vsc;
225	uint64_t *samplepos;
226	size_t needed = NBBY*nbytes;
227	unsigned i;
228
229	/* Sample the ring oscillators round-robin.  */
230	while (needed) {
231		/*
232		 * Switch to the next RO group once we drain the FIFO.
233		 * By the time rnd_add_data is done, we will have
234		 * processed all 512 bytes of the FIFO.  We assume it
235		 * takes at least one cycle per byte (realistically,
236		 * more like ~80cpb to draw from the FIFO and then
237		 * process it with rnd_add_data), so there is no need
238		 * for any other delays.
239		 */
240		sc->sc_rogroup++;
241		sc->sc_rogroup %= NROGROUPS;
242		octrnm_raw_entropy(sc, sc->sc_rogroup);
243
244		/*
245		 * Gather quarter the FIFO at a time -- we are limited
246		 * to 128 bytes because of limits on the CVMSEG buffer.
247		 */
248		CTASSERT(sizeof sc->sc_sample == 512);
249		CTASSERT(__arraycount(sc->sc_sample) == RNG_FIFO_WORDS);
250		for (samplepos = sc->sc_sample, i = 0; i < 4; i++) {
251			octrnm_iobdma(sc, samplepos, RNG_FIFO_WORDS / 4);
252			samplepos += RNG_FIFO_WORDS / 4;
253		}
254#ifdef OCTRNM_DEBUG
255		hexdump(printf, "rnm", sc->sc_sample, sizeof sc->sc_sample);
256#endif
257		rnd_add_data_sync(&sc->sc_rndsrc, sc->sc_sample,
258		    sizeof sc->sc_sample, NBBY*sizeof(sc->sc_sample)/BPB);
259		needed -= MIN(needed, MAX(1, NBBY*sizeof(sc->sc_sample)/BPB));
260
261		/* Now's a good time to yield.  */
262		preempt_point();
263	}
264
265	/* Zero the sample.  */
266	explicit_memset(sc->sc_sample, 0, sizeof sc->sc_sample);
267}
268
269/*
270 * octrnm_reset(sc)
271 *
272 *	Reset the RNM unit, disabling it and clearing the FIFO.
273 */
274static void
275octrnm_reset(struct octrnm_softc *sc)
276{
277
278	bus_space_write_8(sc->sc_bust, sc->sc_regh, RNM_CTL_STATUS_OFFSET,
279	    RNM_CTL_STATUS_RNG_RST|RNM_CTL_STATUS_RNM_RST);
280}
281
282/*
283 * octrnm_conditioned_deterministic(sc)
284 *
285 *	Switch the RNM unit into the deterministic LFSR/SHA-1 mode with
286 *	no entropy, for the next data loaded into the FIFO.
287 */
288static void
289octrnm_conditioned_deterministic(struct octrnm_softc *sc)
290{
291
292	bus_space_write_8(sc->sc_bust, sc->sc_regh, RNM_CTL_STATUS_OFFSET,
293	    RNM_CTL_STATUS_RNG_EN);
294}
295
296/*
297 * octrnm_conditioned_entropy(sc)
298 *
299 *	Switch the RNM unit to generate ring oscillator samples
300 *	conditioned with an LFSR/SHA-1, for the next data loaded into
301 *	the FIFO.
302 */
303static void __unused
304octrnm_conditioned_entropy(struct octrnm_softc *sc)
305{
306
307	bus_space_write_8(sc->sc_bust, sc->sc_regh, RNM_CTL_STATUS_OFFSET,
308	    RNM_CTL_STATUS_RNG_EN|RNM_CTL_STATUS_ENT_EN);
309}
310
311/*
312 * octrnm_raw_entropy(sc, rogroup)
313 *
314 *	Switch the RNM unit to generate raw ring oscillator samples
315 *	from the specified group of eight ring oscillator.
316 */
317static void
318octrnm_raw_entropy(struct octrnm_softc *sc, unsigned rogroup)
319{
320	uint64_t ctl = 0;
321
322	ctl |= RNM_CTL_STATUS_RNG_EN;	/* enable FIFO */
323	ctl |= RNM_CTL_STATUS_ENT_EN;	/* enable entropy source */
324	ctl |= RNM_CTL_STATUS_EXP_ENT;	/* expose entropy without LFSR/SHA-1 */
325	ctl |= __SHIFTIN(rogroup, RNM_CTL_STATUS_ENT_SEL_MASK);
326
327	bus_space_write_8(sc->sc_bust, sc->sc_regh, RNM_CTL_STATUS_OFFSET,
328	    ctl);
329}
330
331/*
332 * octrnm_load(sc)
333 *
334 *	Load a single 64-bit word out of the FIFO.
335 */
336static uint64_t
337octrnm_load(struct octrnm_softc *sc)
338{
339	uint64_t addr = OCTEON_ADDR_IO_DID(RNM_MAJOR_DID, RNM_SUB_DID);
340
341	return octeon_xkphys_read_8(addr);
342}
343
344/*
345 * octrnm_iobdma(sc, buf, nwords)
346 *
347 *	Load nwords, at most 32, out of the FIFO into buf.
348 */
349static void
350octrnm_iobdma(struct octrnm_softc *sc, uint64_t *buf, unsigned nwords)
351{
352 	/* ``scraddr'' part is index in 64-bit words, not address */
353	size_t scraddr = OCTEON_CVMSEG_OFFSET(csm_rnm);
354	uint64_t iobdma = IOBDMA_CREATE(RNM_MAJOR_DID, RNM_SUB_DID,
355	    scraddr / sizeof(uint64_t), nwords, 0);
356
357	KASSERT(nwords < 128);			/* iobdma address restriction */
358	KASSERT(nwords <= CVMSEG_LM_RNM_SIZE);	/* size of CVMSEG LM buffer */
359
360	octeon_iobdma_write_8(iobdma);
361	OCTEON_SYNCIOBDMA;
362	for (; nwords --> 0; scraddr += 8)
363		*buf++ = octeon_cvmseg_read_8(scraddr);
364}
365
/*
 * octrnm_delay(ncycles)
 *
 *	Busy-wait until at least ncycles ticks of the CP0 count
 *	register have elapsed.  ncycles must be at most UINT32_MAX/2
 *	so we behave reasonably even if the counter rolls over.
 */
static void
octrnm_delay(uint32_t ncycles)
{
	const uint32_t start = mips3_cp0_count_read();

	KASSERT(ncycles <= UINT32_MAX/2);

	/*
	 * Compare elapsed ticks against ncycles rather than the
	 * remaining distance to a deadline.  Unsigned subtraction
	 * handles counter wraparound, and if the counter has not yet
	 * advanced on the first poll the elapsed time is zero, so we
	 * keep waiting instead of returning immediately.
	 */
	while ((uint32_t)(mips3_cp0_count_read() - start) < ncycles)
		continue;
}
382