1/*-
2 * Copyright (c) 2009 Yahoo! Inc.
3 * Copyright (c) 2011-2015 LSI Corp.
4 * Copyright (c) 2013-2016 Avago Technologies
5 * Copyright 2000-2020 Broadcom Inc.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * Broadcom Inc. (LSI) MPT-Fusion Host Adapter FreeBSD
30 *
31 */
32
33#include <sys/cdefs.h>
34/* Communications core for Avago Technologies (LSI) MPT3 */
35
36/* TODO Move headers to mprvar */
37#include <sys/types.h>
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/kernel.h>
41#include <sys/selinfo.h>
42#include <sys/lock.h>
43#include <sys/mutex.h>
44#include <sys/module.h>
45#include <sys/bus.h>
46#include <sys/conf.h>
47#include <sys/bio.h>
48#include <sys/malloc.h>
49#include <sys/uio.h>
50#include <sys/sysctl.h>
51#include <sys/smp.h>
52#include <sys/queue.h>
53#include <sys/kthread.h>
54#include <sys/taskqueue.h>
55#include <sys/endian.h>
56#include <sys/eventhandler.h>
57#include <sys/sbuf.h>
58#include <sys/priv.h>
59
60#include <machine/bus.h>
61#include <machine/resource.h>
62#include <sys/rman.h>
63#include <sys/proc.h>
64
65#include <dev/pci/pcivar.h>
66
67#include <cam/cam.h>
68#include <cam/cam_ccb.h>
69#include <cam/scsi/scsi_all.h>
70
71#include <dev/mpr/mpi/mpi2_type.h>
72#include <dev/mpr/mpi/mpi2.h>
73#include <dev/mpr/mpi/mpi2_ioc.h>
74#include <dev/mpr/mpi/mpi2_sas.h>
75#include <dev/mpr/mpi/mpi2_pci.h>
76#include <dev/mpr/mpi/mpi2_cnfg.h>
77#include <dev/mpr/mpi/mpi2_init.h>
78#include <dev/mpr/mpi/mpi2_tool.h>
79#include <dev/mpr/mpr_ioctl.h>
80#include <dev/mpr/mprvar.h>
81#include <dev/mpr/mpr_table.h>
82#include <dev/mpr/mpr_sas.h>
83
84static int mpr_diag_reset(struct mpr_softc *sc, int sleep_flag);
85static int mpr_init_queues(struct mpr_softc *sc);
86static void mpr_resize_queues(struct mpr_softc *sc);
87static int mpr_message_unit_reset(struct mpr_softc *sc, int sleep_flag);
88static int mpr_transition_operational(struct mpr_softc *sc);
89static int mpr_iocfacts_allocate(struct mpr_softc *sc, uint8_t attaching);
90static void mpr_iocfacts_free(struct mpr_softc *sc);
91static void mpr_startup(void *arg);
92static int mpr_send_iocinit(struct mpr_softc *sc);
93static int mpr_alloc_queues(struct mpr_softc *sc);
94static int mpr_alloc_hw_queues(struct mpr_softc *sc);
95static int mpr_alloc_replies(struct mpr_softc *sc);
96static int mpr_alloc_requests(struct mpr_softc *sc);
97static int mpr_alloc_nvme_prp_pages(struct mpr_softc *sc);
98static int mpr_attach_log(struct mpr_softc *sc);
99static __inline void mpr_complete_command(struct mpr_softc *sc,
100    struct mpr_command *cm);
101static void mpr_dispatch_event(struct mpr_softc *sc, uintptr_t data,
102    MPI2_EVENT_NOTIFICATION_REPLY *reply);
103static void mpr_config_complete(struct mpr_softc *sc, struct mpr_command *cm);
104static void mpr_periodic(void *);
105static int mpr_reregister_events(struct mpr_softc *sc);
106static void mpr_enqueue_request(struct mpr_softc *sc, struct mpr_command *cm);
107static int mpr_get_iocfacts(struct mpr_softc *sc, MPI2_IOC_FACTS_REPLY *facts);
108static int mpr_wait_db_ack(struct mpr_softc *sc, int timeout, int sleep_flag);
109static int mpr_debug_sysctl(SYSCTL_HANDLER_ARGS);
110static int mpr_dump_reqs(SYSCTL_HANDLER_ARGS);
111static void mpr_parse_debug(struct mpr_softc *sc, char *list);
112static void adjust_iocfacts_endianness(MPI2_IOC_FACTS_REPLY *facts);
113
114SYSCTL_NODE(_hw, OID_AUTO, mpr, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
115    "MPR Driver Parameters");
116
117MALLOC_DEFINE(M_MPR, "mpr", "mpr driver memory");
118
119/*
120 * Do a "Diagnostic Reset" aka a hard reset.  This should get the chip out of
121 * any state and back to its initialization state machine.
122 */
123static char mpt2_reset_magic[] = { 0x00, 0x0f, 0x04, 0x0b, 0x02, 0x07, 0x0d };
124
/*
 * This union allows cm->cm_desc.Words to be converted smoothly with
 * le64toh.  The compiler only accepts a plain uint64_t argument there;
 * passing the aggregate directly produces the error
 * "aggregate value used where an integer was expected".
 */
131typedef union {
132        u64 word;
133        struct {
134                u32 low;
135                u32 high;
136        } u;
137} request_descriptor_t;
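
/*
 * A minimal usage sketch (illustrative only, not part of the driver): copy
 * the two 32-bit halves of a descriptor into the union and the 64-bit view
 * can then be handed to integer-only byte-order macros such as le64toh:
 *
 *	request_descriptor_t rd;
 *	rd.u.low = cm->cm_desc.Words.Low;
 *	rd.u.high = cm->cm_desc.Words.High;
 *	// rd.word is now a plain 64-bit integer
 *
 * mpr_enqueue_request() below uses this union when posting request
 * descriptors to the hardware.
 */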
138
139/* Rate limit chain-fail messages to 1 per minute */
140static struct timeval mpr_chainfail_interval = { 60, 0 };
141
/*
 * sleep_flag can be either CAN_SLEEP or NO_SLEEP.
 * If this function is called from process context it can safely sleep, but
 * if it is called from an interrupt handler it must not sleep and needs the
 * NO_SLEEP flag set.  Based on the sleep flag the driver will wait using
 * either msleep, pause, or DELAY.  msleep and pause are similar, but pause
 * is used when the driver does not hold mpr_mtx.
 */
151static int
152mpr_diag_reset(struct mpr_softc *sc,int sleep_flag)
153{
154	uint32_t reg;
155	int i, error, tries = 0;
156	uint8_t first_wait_done = FALSE;
157
158	mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
159
160	/* Clear any pending interrupts */
161	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
162
	/*
	 * Force NO_SLEEP for threads that are prohibited from sleeping,
	 * e.g. threads running in an interrupt handler.
	 */
167	if (curthread->td_no_sleeping)
168		sleep_flag = NO_SLEEP;
169
170	mpr_dprint(sc, MPR_INIT, "sequence start, sleep_flag=%d\n", sleep_flag);
171	/* Push the magic sequence */
172	error = ETIMEDOUT;
173	while (tries++ < 20) {
174		for (i = 0; i < sizeof(mpt2_reset_magic); i++)
175			mpr_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET,
176			    mpt2_reset_magic[i]);
177
178		/* wait 100 msec */
179		if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
180			msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
181			    "mprdiag", hz/10);
182		else if (sleep_flag == CAN_SLEEP)
183			pause("mprdiag", hz/10);
184		else
185			DELAY(100 * 1000);
186
187		reg = mpr_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
188		if (reg & MPI2_DIAG_DIAG_WRITE_ENABLE) {
189			error = 0;
190			break;
191		}
192	}
193	if (error) {
194		mpr_dprint(sc, MPR_INIT, "sequence failed, error=%d, exit\n",
195		    error);
196		return (error);
197	}
198
199	/* Send the actual reset.  XXX need to refresh the reg? */
200	reg |= MPI2_DIAG_RESET_ADAPTER;
201	mpr_dprint(sc, MPR_INIT, "sequence success, sending reset, reg= 0x%x\n",
202	    reg);
203	mpr_regwrite(sc, MPI2_HOST_DIAGNOSTIC_OFFSET, reg);
204
205	/* Wait up to 300 seconds in 50ms intervals */
206	error = ETIMEDOUT;
207	for (i = 0; i < 6000; i++) {
208		/*
209		 * Wait 50 msec. If this is the first time through, wait 256
210		 * msec to satisfy Diag Reset timing requirements.
211		 */
212		if (first_wait_done) {
213			if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
214				msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
215				    "mprdiag", hz/20);
216			else if (sleep_flag == CAN_SLEEP)
217				pause("mprdiag", hz/20);
218			else
219				DELAY(50 * 1000);
220		} else {
221			DELAY(256 * 1000);
222			first_wait_done = TRUE;
223		}
224		/*
225		 * Check for the RESET_ADAPTER bit to be cleared first, then
226		 * wait for the RESET state to be cleared, which takes a little
227		 * longer.
228		 */
229		reg = mpr_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
230		if (reg & MPI2_DIAG_RESET_ADAPTER) {
231			continue;
232		}
233		reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
234		if ((reg & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_RESET) {
235			error = 0;
236			break;
237		}
238	}
239	if (error) {
240		mpr_dprint(sc, MPR_INIT, "reset failed, error= %d, exit\n",
241		    error);
242		return (error);
243	}
244
245	mpr_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET, 0x0);
246	mpr_dprint(sc, MPR_INIT, "diag reset success, exit\n");
247
248	return (0);
249}
250
251static int
252mpr_message_unit_reset(struct mpr_softc *sc, int sleep_flag)
253{
254	int error;
255
256	MPR_FUNCTRACE(sc);
257
258	mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
259
260	error = 0;
261	mpr_regwrite(sc, MPI2_DOORBELL_OFFSET,
262	    MPI2_FUNCTION_IOC_MESSAGE_UNIT_RESET <<
263	    MPI2_DOORBELL_FUNCTION_SHIFT);
264
265	if (mpr_wait_db_ack(sc, 5, sleep_flag) != 0) {
266		mpr_dprint(sc, MPR_INIT|MPR_FAULT,
267		    "Doorbell handshake failed\n");
268		error = ETIMEDOUT;
269	}
270
271	mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
272	return (error);
273}
274
275static int
276mpr_transition_ready(struct mpr_softc *sc)
277{
278	uint32_t reg, state;
279	int error, tries = 0;
280	int sleep_flags;
281
282	MPR_FUNCTRACE(sc);
283	/* If we are in attach call, do not sleep */
284	sleep_flags = (sc->mpr_flags & MPR_FLAGS_ATTACH_DONE)
285	    ? CAN_SLEEP : NO_SLEEP;
286
287	error = 0;
288
289	mpr_dprint(sc, MPR_INIT, "%s entered, sleep_flags= %d\n",
290	    __func__, sleep_flags);
291
292	while (tries++ < 1200) {
293		reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
294		mpr_dprint(sc, MPR_INIT, "  Doorbell= 0x%x\n", reg);
295
296		/*
297		 * Ensure the IOC is ready to talk.  If it's not, try
298		 * resetting it.
299		 */
300		if (reg & MPI2_DOORBELL_USED) {
301			mpr_dprint(sc, MPR_INIT, "  Not ready, sending diag "
302			    "reset\n");
303			mpr_diag_reset(sc, sleep_flags);
304			DELAY(50000);
305			continue;
306		}
307
308		/* Is the adapter owned by another peer? */
309		if ((reg & MPI2_DOORBELL_WHO_INIT_MASK) ==
310		    (MPI2_WHOINIT_PCI_PEER << MPI2_DOORBELL_WHO_INIT_SHIFT)) {
311			mpr_dprint(sc, MPR_INIT|MPR_FAULT, "IOC is under the "
312			    "control of another peer host, aborting "
313			    "initialization.\n");
314			error = ENXIO;
315			break;
316		}
317
318		state = reg & MPI2_IOC_STATE_MASK;
319		if (state == MPI2_IOC_STATE_READY) {
320			/* Ready to go! */
321			error = 0;
322			break;
323		} else if (state == MPI2_IOC_STATE_FAULT) {
324			mpr_dprint(sc, MPR_INIT|MPR_FAULT, "IOC in fault "
325			    "state 0x%x, resetting\n",
326			    state & MPI2_DOORBELL_FAULT_CODE_MASK);
327			mpr_diag_reset(sc, sleep_flags);
328		} else if (state == MPI2_IOC_STATE_OPERATIONAL) {
329			/* Need to take ownership */
330			mpr_message_unit_reset(sc, sleep_flags);
331		} else if (state == MPI2_IOC_STATE_RESET) {
332			/* Wait a bit, IOC might be in transition */
333			mpr_dprint(sc, MPR_INIT|MPR_FAULT,
334			    "IOC in unexpected reset state\n");
335		} else {
336			mpr_dprint(sc, MPR_INIT|MPR_FAULT,
337			    "IOC in unknown state 0x%x\n", state);
338			error = EINVAL;
339			break;
340		}
341
342		/* Wait 50ms for things to settle down. */
343		DELAY(50000);
344	}
345
346	if (error)
347		mpr_dprint(sc, MPR_INIT|MPR_FAULT,
348		    "Cannot transition IOC to ready\n");
349	mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
350	return (error);
351}
352
353static int
354mpr_transition_operational(struct mpr_softc *sc)
355{
356	uint32_t reg, state;
357	int error;
358
359	MPR_FUNCTRACE(sc);
360
361	error = 0;
362	reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
363	mpr_dprint(sc, MPR_INIT, "%s entered, Doorbell= 0x%x\n", __func__, reg);
364
365	state = reg & MPI2_IOC_STATE_MASK;
366	if (state != MPI2_IOC_STATE_READY) {
367		mpr_dprint(sc, MPR_INIT, "IOC not ready\n");
368		if ((error = mpr_transition_ready(sc)) != 0) {
369			mpr_dprint(sc, MPR_INIT|MPR_FAULT,
370			    "failed to transition ready, exit\n");
371			return (error);
372		}
373	}
374
375	error = mpr_send_iocinit(sc);
376	mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
377
378	return (error);
379}
380
381static void
382mpr_resize_queues(struct mpr_softc *sc)
383{
384	u_int reqcr, prireqcr, maxio, sges_per_frame, chain_seg_size;
385
	/*
	 * Size the queues. Since the reply queues always need one free
	 * entry, we'll deduct one reply message here.  The LSI documents
	 * suggest instead adding a count to the request queue, but I think
	 * it's better to deduct it from the reply queue.
	 */
392	prireqcr = MAX(1, sc->max_prireqframes);
393	prireqcr = MIN(prireqcr, sc->facts->HighPriorityCredit);
394
395	reqcr = MAX(2, sc->max_reqframes);
396	reqcr = MIN(reqcr, sc->facts->RequestCredit);
397
398	sc->num_reqs = prireqcr + reqcr;
399	sc->num_prireqs = prireqcr;
400	sc->num_replies = MIN(sc->max_replyframes + sc->max_evtframes,
401	    sc->facts->MaxReplyDescriptorPostQueueDepth) - 1;
402
403	/* Store the request frame size in bytes rather than as 32bit words */
404	sc->reqframesz = sc->facts->IOCRequestFrameSize * 4;
405
	/*
	 * Gen3 and later IOCs use the IOCMaxChainSegmentSize from IOC Facts
	 * to get the size of a Chain Frame.  Previous generations use the
	 * Request Frame size for the Chain Frame size.  If
	 * IOCMaxChainSegmentSize is 0, use the default value.
	 * IOCMaxChainSegmentSize is the number of 16-byte elements that can
	 * fit in a Chain Frame, where 16 bytes is the size of an IEEE Simple
	 * SGE.
	 */
414	if (sc->facts->MsgVersion >= MPI2_VERSION_02_05) {
415		chain_seg_size = sc->facts->IOCMaxChainSegmentSize;
416		if (chain_seg_size == 0)
417			chain_seg_size = MPR_DEFAULT_CHAIN_SEG_SIZE;
418		sc->chain_frame_size = chain_seg_size *
419		    MPR_MAX_CHAIN_ELEMENT_SIZE;
420	} else {
421		sc->chain_frame_size = sc->reqframesz;
422	}
423
424	/*
425	 * Max IO Size is Page Size * the following:
426	 * ((SGEs per frame - 1 for chain element) * Max Chain Depth)
427	 * + 1 for no chain needed in last frame
428	 *
429	 * If user suggests a Max IO size to use, use the smaller of the
430	 * user's value and the calculated value as long as the user's
431	 * value is larger than 0. The user's value is in pages.
432	 */
433	sges_per_frame = sc->chain_frame_size/sizeof(MPI2_IEEE_SGE_SIMPLE64)-1;
434	maxio = (sges_per_frame * sc->facts->MaxChainDepth + 1) * PAGE_SIZE;
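
	/*
	 * Worked example (illustrative numbers only): with a 128-byte chain
	 * frame and 16-byte IEEE simple SGEs, sges_per_frame is 128 / 16 - 1
	 * = 7.  If the IOC reports MaxChainDepth = 128 and PAGE_SIZE is
	 * 4096, maxio = (7 * 128 + 1) * 4096 bytes, or roughly 3.5 MB.
	 */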
435
436	/*
437	 * If I/O size limitation requested then use it and pass up to CAM.
438	 * If not, use maxphys as an optimization hint, but report HW limit.
439	 */
440	if (sc->max_io_pages > 0) {
441		maxio = min(maxio, sc->max_io_pages * PAGE_SIZE);
442		sc->maxio = maxio;
443	} else {
444		sc->maxio = maxio;
445		maxio = min(maxio, maxphys);
446	}
447
448	sc->num_chains = (maxio / PAGE_SIZE + sges_per_frame - 2) /
449	    sges_per_frame * reqcr;
450	if (sc->max_chains > 0 && sc->max_chains < sc->num_chains)
451		sc->num_chains = sc->max_chains;
452
453	/*
454	 * Figure out the number of MSIx-based queues.  If the firmware or
455	 * user has done something crazy and not allowed enough credit for
456	 * the queues to be useful then don't enable multi-queue.
457	 */
458	if (sc->facts->MaxMSIxVectors < 2)
459		sc->msi_msgs = 1;
460
461	if (sc->msi_msgs > 1) {
462		sc->msi_msgs = MIN(sc->msi_msgs, mp_ncpus);
463		sc->msi_msgs = MIN(sc->msi_msgs, sc->facts->MaxMSIxVectors);
464		if (sc->num_reqs / sc->msi_msgs < 2)
465			sc->msi_msgs = 1;
466	}
467
468	mpr_dprint(sc, MPR_INIT, "Sized queues to q=%d reqs=%d replies=%d\n",
469	    sc->msi_msgs, sc->num_reqs, sc->num_replies);
470}
471
/*
 * This is called during attach and when re-initializing due to a Diag Reset.
 * IOC Facts is used to allocate many of the structures needed by the driver.
 * If called from attach, de-allocation is not required because the driver has
 * not allocated any structures yet, but if called from a Diag Reset, structures
 * previously allocated based on the old IOC Facts will need to be freed and
 * re-allocated based on the latest IOC Facts.
 */
480static int
481mpr_iocfacts_allocate(struct mpr_softc *sc, uint8_t attaching)
482{
483	int error;
484	Mpi2IOCFactsReply_t saved_facts;
485	uint8_t saved_mode, reallocating;
486
487	mpr_dprint(sc, MPR_INIT|MPR_TRACE, "%s entered\n", __func__);
488
489	/* Save old IOC Facts and then only reallocate if Facts have changed */
490	if (!attaching) {
491		bcopy(sc->facts, &saved_facts, sizeof(MPI2_IOC_FACTS_REPLY));
492	}
493
494	/*
495	 * Get IOC Facts.  In all cases throughout this function, panic if doing
496	 * a re-initialization and only return the error if attaching so the OS
497	 * can handle it.
498	 */
499	if ((error = mpr_get_iocfacts(sc, sc->facts)) != 0) {
500		if (attaching) {
501			mpr_dprint(sc, MPR_INIT|MPR_FAULT, "Failed to get "
502			    "IOC Facts with error %d, exit\n", error);
503			return (error);
504		} else {
505			panic("%s failed to get IOC Facts with error %d\n",
506			    __func__, error);
507		}
508	}
509
510	MPR_DPRINT_PAGE(sc, MPR_XINFO, iocfacts, sc->facts);
511
512	snprintf(sc->fw_version, sizeof(sc->fw_version),
513	    "%02d.%02d.%02d.%02d",
514	    sc->facts->FWVersion.Struct.Major,
515	    sc->facts->FWVersion.Struct.Minor,
516	    sc->facts->FWVersion.Struct.Unit,
517	    sc->facts->FWVersion.Struct.Dev);
518
519	snprintf(sc->msg_version, sizeof(sc->msg_version), "%d.%d",
520	    (sc->facts->MsgVersion & MPI2_IOCFACTS_MSGVERSION_MAJOR_MASK) >>
521	    MPI2_IOCFACTS_MSGVERSION_MAJOR_SHIFT,
522	    (sc->facts->MsgVersion & MPI2_IOCFACTS_MSGVERSION_MINOR_MASK) >>
523	    MPI2_IOCFACTS_MSGVERSION_MINOR_SHIFT);
524
525	mpr_dprint(sc, MPR_INFO, "Firmware: %s, Driver: %s\n", sc->fw_version,
526	    MPR_DRIVER_VERSION);
527	mpr_dprint(sc, MPR_INFO,
528	    "IOCCapabilities: %b\n", sc->facts->IOCCapabilities,
529	    "\20" "\3ScsiTaskFull" "\4DiagTrace" "\5SnapBuf" "\6ExtBuf"
530	    "\7EEDP" "\10BiDirTarg" "\11Multicast" "\14TransRetry" "\15IR"
531	    "\16EventReplay" "\17RaidAccel" "\20MSIXIndex" "\21HostDisc"
532	    "\22FastPath" "\23RDPQArray" "\24AtomicReqDesc" "\25PCIeSRIOV");
533
534	/*
535	 * If the chip doesn't support event replay then a hard reset will be
536	 * required to trigger a full discovery.  Do the reset here then
537	 * retransition to Ready.  A hard reset might have already been done,
538	 * but it doesn't hurt to do it again.  Only do this if attaching, not
539	 * for a Diag Reset.
540	 */
541	if (attaching && ((sc->facts->IOCCapabilities &
542	    MPI2_IOCFACTS_CAPABILITY_EVENT_REPLAY) == 0)) {
543		mpr_dprint(sc, MPR_INIT, "No event replay, resetting\n");
544		mpr_diag_reset(sc, NO_SLEEP);
545		if ((error = mpr_transition_ready(sc)) != 0) {
546			mpr_dprint(sc, MPR_INIT|MPR_FAULT, "Failed to "
547			    "transition to ready with error %d, exit\n",
548			    error);
549			return (error);
550		}
551	}
552
553	/*
554	 * Set flag if IR Firmware is loaded.  If the RAID Capability has
555	 * changed from the previous IOC Facts, log a warning, but only if
556	 * checking this after a Diag Reset and not during attach.
557	 */
558	saved_mode = sc->ir_firmware;
559	if (sc->facts->IOCCapabilities &
560	    MPI2_IOCFACTS_CAPABILITY_INTEGRATED_RAID)
561		sc->ir_firmware = 1;
562	if (!attaching) {
563		if (sc->ir_firmware != saved_mode) {
564			mpr_dprint(sc, MPR_INIT|MPR_FAULT, "new IR/IT mode "
565			    "in IOC Facts does not match previous mode\n");
566		}
567	}
568
569	/* Only deallocate and reallocate if relevant IOC Facts have changed */
570	reallocating = FALSE;
571	sc->mpr_flags &= ~MPR_FLAGS_REALLOCATED;
572
573	if ((!attaching) &&
574	    ((saved_facts.MsgVersion != sc->facts->MsgVersion) ||
575	    (saved_facts.HeaderVersion != sc->facts->HeaderVersion) ||
576	    (saved_facts.MaxChainDepth != sc->facts->MaxChainDepth) ||
577	    (saved_facts.RequestCredit != sc->facts->RequestCredit) ||
578	    (saved_facts.ProductID != sc->facts->ProductID) ||
579	    (saved_facts.IOCCapabilities != sc->facts->IOCCapabilities) ||
580	    (saved_facts.IOCRequestFrameSize !=
581	    sc->facts->IOCRequestFrameSize) ||
582	    (saved_facts.IOCMaxChainSegmentSize !=
583	    sc->facts->IOCMaxChainSegmentSize) ||
584	    (saved_facts.MaxTargets != sc->facts->MaxTargets) ||
585	    (saved_facts.MaxSasExpanders != sc->facts->MaxSasExpanders) ||
586	    (saved_facts.MaxEnclosures != sc->facts->MaxEnclosures) ||
587	    (saved_facts.HighPriorityCredit != sc->facts->HighPriorityCredit) ||
588	    (saved_facts.MaxReplyDescriptorPostQueueDepth !=
589	    sc->facts->MaxReplyDescriptorPostQueueDepth) ||
590	    (saved_facts.ReplyFrameSize != sc->facts->ReplyFrameSize) ||
591	    (saved_facts.MaxVolumes != sc->facts->MaxVolumes) ||
592	    (saved_facts.MaxPersistentEntries !=
593	    sc->facts->MaxPersistentEntries))) {
594		reallocating = TRUE;
595
596		/* Record that we reallocated everything */
597		sc->mpr_flags |= MPR_FLAGS_REALLOCATED;
598	}
599
600	/*
601	 * Some things should be done if attaching or re-allocating after a Diag
602	 * Reset, but are not needed after a Diag Reset if the FW has not
603	 * changed.
604	 */
605	if (attaching || reallocating) {
606		/*
607		 * Check if controller supports FW diag buffers and set flag to
608		 * enable each type.
609		 */
610		if (sc->facts->IOCCapabilities &
611		    MPI2_IOCFACTS_CAPABILITY_DIAG_TRACE_BUFFER)
612			sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_TRACE].
613			    enabled = TRUE;
614		if (sc->facts->IOCCapabilities &
615		    MPI2_IOCFACTS_CAPABILITY_SNAPSHOT_BUFFER)
616			sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_SNAPSHOT].
617			    enabled = TRUE;
618		if (sc->facts->IOCCapabilities &
619		    MPI2_IOCFACTS_CAPABILITY_EXTENDED_BUFFER)
620			sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_EXTENDED].
621			    enabled = TRUE;
622
623		/*
624		 * Set flags for some supported items.
625		 */
626		if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EEDP)
627			sc->eedp_enabled = TRUE;
628		if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_TLR)
629			sc->control_TLR = TRUE;
630		if ((sc->facts->IOCCapabilities &
631		    MPI26_IOCFACTS_CAPABILITY_ATOMIC_REQ) &&
632		    (sc->mpr_flags & MPR_FLAGS_SEA_IOC))
633			sc->atomic_desc_capable = TRUE;
634
635		mpr_resize_queues(sc);
636
637		/*
638		 * Initialize all Tail Queues
639		 */
640		TAILQ_INIT(&sc->req_list);
641		TAILQ_INIT(&sc->high_priority_req_list);
642		TAILQ_INIT(&sc->chain_list);
643		TAILQ_INIT(&sc->prp_page_list);
644		TAILQ_INIT(&sc->tm_list);
645	}
646
647	/*
648	 * If doing a Diag Reset and the FW is significantly different
649	 * (reallocating will be set above in IOC Facts comparison), then all
650	 * buffers based on the IOC Facts will need to be freed before they are
651	 * reallocated.
652	 */
653	if (reallocating) {
654		mpr_iocfacts_free(sc);
655		mprsas_realloc_targets(sc, saved_facts.MaxTargets +
656		    saved_facts.MaxVolumes);
657	}
658
659	/*
660	 * Any deallocation has been completed.  Now start reallocating
661	 * if needed.  Will only need to reallocate if attaching or if the new
662	 * IOC Facts are different from the previous IOC Facts after a Diag
663	 * Reset. Targets have already been allocated above if needed.
664	 */
665	error = 0;
666	while (attaching || reallocating) {
667		if ((error = mpr_alloc_hw_queues(sc)) != 0)
668			break;
669		if ((error = mpr_alloc_replies(sc)) != 0)
670			break;
671		if ((error = mpr_alloc_requests(sc)) != 0)
672			break;
673		if ((error = mpr_alloc_queues(sc)) != 0)
674			break;
675		break;
676	}
677	if (error) {
678		mpr_dprint(sc, MPR_INIT|MPR_ERROR,
679		    "Failed to alloc queues with error %d\n", error);
680		mpr_free(sc);
681		return (error);
682	}
683
684	/* Always initialize the queues */
685	bzero(sc->free_queue, sc->fqdepth * 4);
686	mpr_init_queues(sc);
687
688	/*
689	 * Always get the chip out of the reset state, but only panic if not
690	 * attaching.  If attaching and there is an error, that is handled by
691	 * the OS.
692	 */
693	error = mpr_transition_operational(sc);
694	if (error != 0) {
695		mpr_dprint(sc, MPR_INIT|MPR_FAULT, "Failed to "
696		    "transition to operational with error %d\n", error);
697		mpr_free(sc);
698		return (error);
699	}
700
701	/*
702	 * Finish the queue initialization.
703	 * These are set here instead of in mpr_init_queues() because the
704	 * IOC resets these values during the state transition in
705	 * mpr_transition_operational().  The free index is set to 1
706	 * because the corresponding index in the IOC is set to 0, and the
707	 * IOC treats the queues as full if both are set to the same value.
708	 * Hence the reason that the queue can't hold all of the possible
709	 * replies.
710	 */
711	sc->replypostindex = 0;
712	mpr_regwrite(sc, MPI2_REPLY_FREE_HOST_INDEX_OFFSET, sc->replyfreeindex);
713	mpr_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, 0);
714
715	/*
716	 * Attach the subsystems so they can prepare their event masks.
717	 * XXX Should be dynamic so that IM/IR and user modules can attach
718	 */
719	error = 0;
720	while (attaching) {
721		mpr_dprint(sc, MPR_INIT, "Attaching subsystems\n");
722		if ((error = mpr_attach_log(sc)) != 0)
723			break;
724		if ((error = mpr_attach_sas(sc)) != 0)
725			break;
726		if ((error = mpr_attach_user(sc)) != 0)
727			break;
728		break;
729	}
730	if (error) {
731		mpr_dprint(sc, MPR_INIT|MPR_ERROR,
732		    "Failed to attach all subsystems: error %d\n", error);
733		mpr_free(sc);
734		return (error);
735	}
736
737	/*
738	 * XXX If the number of MSI-X vectors changes during re-init, this
739	 * won't see it and adjust.
740	 */
741	if ((attaching || reallocating) && (error = mpr_pci_setup_interrupts(sc)) != 0) {
742		mpr_dprint(sc, MPR_INIT|MPR_ERROR,
743		    "Failed to setup interrupts\n");
744		mpr_free(sc);
745		return (error);
746	}
747
748	return (error);
749}
750
/*
 * This is called when memory is being freed (during detach, for example) and
 * when buffers need to be reallocated due to a Diag Reset.
 */
755static void
756mpr_iocfacts_free(struct mpr_softc *sc)
757{
758	struct mpr_command *cm;
759	int i;
760
761	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
762
763	if (sc->free_busaddr != 0)
764		bus_dmamap_unload(sc->queues_dmat, sc->queues_map);
765	if (sc->free_queue != NULL)
766		bus_dmamem_free(sc->queues_dmat, sc->free_queue,
767		    sc->queues_map);
768	if (sc->queues_dmat != NULL)
769		bus_dma_tag_destroy(sc->queues_dmat);
770
771	if (sc->chain_frames != NULL) {
772		bus_dmamap_unload(sc->chain_dmat, sc->chain_map);
773		bus_dmamem_free(sc->chain_dmat, sc->chain_frames,
774		    sc->chain_map);
775	}
776	if (sc->chain_dmat != NULL)
777		bus_dma_tag_destroy(sc->chain_dmat);
778
779	if (sc->sense_busaddr != 0)
780		bus_dmamap_unload(sc->sense_dmat, sc->sense_map);
781	if (sc->sense_frames != NULL)
782		bus_dmamem_free(sc->sense_dmat, sc->sense_frames,
783		    sc->sense_map);
784	if (sc->sense_dmat != NULL)
785		bus_dma_tag_destroy(sc->sense_dmat);
786
787	if (sc->prp_page_busaddr != 0)
788		bus_dmamap_unload(sc->prp_page_dmat, sc->prp_page_map);
789	if (sc->prp_pages != NULL)
790		bus_dmamem_free(sc->prp_page_dmat, sc->prp_pages,
791		    sc->prp_page_map);
792	if (sc->prp_page_dmat != NULL)
793		bus_dma_tag_destroy(sc->prp_page_dmat);
794
795	if (sc->reply_busaddr != 0)
796		bus_dmamap_unload(sc->reply_dmat, sc->reply_map);
797	if (sc->reply_frames != NULL)
798		bus_dmamem_free(sc->reply_dmat, sc->reply_frames,
799		    sc->reply_map);
800	if (sc->reply_dmat != NULL)
801		bus_dma_tag_destroy(sc->reply_dmat);
802
803	if (sc->req_busaddr != 0)
804		bus_dmamap_unload(sc->req_dmat, sc->req_map);
805	if (sc->req_frames != NULL)
806		bus_dmamem_free(sc->req_dmat, sc->req_frames, sc->req_map);
807	if (sc->req_dmat != NULL)
808		bus_dma_tag_destroy(sc->req_dmat);
809
810	if (sc->chains != NULL)
811		free(sc->chains, M_MPR);
812	if (sc->prps != NULL)
813		free(sc->prps, M_MPR);
814	if (sc->commands != NULL) {
815		for (i = 1; i < sc->num_reqs; i++) {
816			cm = &sc->commands[i];
817			bus_dmamap_destroy(sc->buffer_dmat, cm->cm_dmamap);
818		}
819		free(sc->commands, M_MPR);
820	}
821	if (sc->buffer_dmat != NULL)
822		bus_dma_tag_destroy(sc->buffer_dmat);
823
824	mpr_pci_free_interrupts(sc);
825	free(sc->queues, M_MPR);
826	sc->queues = NULL;
827}
828
829/*
830 * The terms diag reset and hard reset are used interchangeably in the MPI
831 * docs to mean resetting the controller chip.  In this code diag reset
832 * cleans everything up, and the hard reset function just sends the reset
833 * sequence to the chip.  This should probably be refactored so that every
834 * subsystem gets a reset notification of some sort, and can clean up
835 * appropriately.
836 */
837int
838mpr_reinit(struct mpr_softc *sc)
839{
840	int error;
841	struct mprsas_softc *sassc;
842
843	sassc = sc->sassc;
844
845	MPR_FUNCTRACE(sc);
846
847	mtx_assert(&sc->mpr_mtx, MA_OWNED);
848
849	mpr_dprint(sc, MPR_INIT|MPR_INFO, "Reinitializing controller\n");
850	if (sc->mpr_flags & MPR_FLAGS_DIAGRESET) {
851		mpr_dprint(sc, MPR_INIT, "Reset already in progress\n");
852		return 0;
853	}
854
855	/*
856	 * Make sure the completion callbacks can recognize they're getting
857	 * a NULL cm_reply due to a reset.
858	 */
859	sc->mpr_flags |= MPR_FLAGS_DIAGRESET;
860
861	/*
862	 * Mask interrupts here.
863	 */
864	mpr_dprint(sc, MPR_INIT, "Masking interrupts and resetting\n");
865	mpr_mask_intr(sc);
866
867	error = mpr_diag_reset(sc, CAN_SLEEP);
868	if (error != 0) {
869		panic("%s hard reset failed with error %d\n", __func__, error);
870	}
871
872	/* Restore the PCI state, including the MSI-X registers */
873	mpr_pci_restore(sc);
874
875	/* Give the I/O subsystem special priority to get itself prepared */
876	mprsas_handle_reinit(sc);
877
878	/*
879	 * Get IOC Facts and allocate all structures based on this information.
880	 * The attach function will also call mpr_iocfacts_allocate at startup.
881	 * If relevant values have changed in IOC Facts, this function will free
882	 * all of the memory based on IOC Facts and reallocate that memory.
883	 */
884	if ((error = mpr_iocfacts_allocate(sc, FALSE)) != 0) {
885		panic("%s IOC Facts based allocation failed with error %d\n",
886		    __func__, error);
887	}
888
889	/*
890	 * Mapping structures will be re-allocated after getting IOC Page8, so
891	 * free these structures here.
892	 */
893	mpr_mapping_exit(sc);
894
895	/*
896	 * The static page function currently read is IOC Page8.  Others can be
897	 * added in future.  It's possible that the values in IOC Page8 have
898	 * changed after a Diag Reset due to user modification, so always read
899	 * these.  Interrupts are masked, so unmask them before getting config
900	 * pages.
901	 */
902	mpr_unmask_intr(sc);
903	sc->mpr_flags &= ~MPR_FLAGS_DIAGRESET;
904	mpr_base_static_config_pages(sc);
905
	/*
	 * Some mapping info is based on IOC Page8 data, so re-initialize the
	 * mapping tables.
	 */
910	mpr_mapping_initialize(sc);
911
912	/*
913	 * Restart will reload the event masks clobbered by the reset, and
914	 * then enable the port.
915	 */
916	mpr_reregister_events(sc);
917
918	/* the end of discovery will release the simq, so we're done. */
919	mpr_dprint(sc, MPR_INIT|MPR_XINFO, "Finished sc %p post %u free %u\n",
920	    sc, sc->replypostindex, sc->replyfreeindex);
921	mprsas_release_simq_reinit(sassc);
922	mpr_dprint(sc, MPR_INIT, "%s exit error= %d\n", __func__, error);
923
924	return 0;
925}
926
/*
 * Wait for the chip to ACK a word that we've put into its FIFO.
 * Wait up to <timeout> seconds; each iteration of the loop busy-waits for
 * 500 microseconds, so the total is [ 0.5 * (2000 * <timeout>) ]
 * milliseconds.
 */
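
/*
 * Worked example: with the timeout of 5 seconds used by the callers, the
 * NO_SLEEP path runs 2000 * 5 = 10000 iterations of a 500 microsecond
 * DELAY, and the CAN_SLEEP path runs 1000 * 5 = 5000 iterations of a
 * roughly 1 millisecond sleep, so both wait for about 5 seconds total.
 */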
932static int
933mpr_wait_db_ack(struct mpr_softc *sc, int timeout, int sleep_flag)
934{
935	u32 cntdn, count;
936	u32 int_status;
937	u32 doorbell;
938
939	count = 0;
940	cntdn = (sleep_flag == CAN_SLEEP) ? 1000*timeout : 2000*timeout;
941	do {
942		int_status = mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
943		if (!(int_status & MPI2_HIS_SYS2IOC_DB_STATUS)) {
944			mpr_dprint(sc, MPR_TRACE, "%s: successful count(%d), "
945			    "timeout(%d)\n", __func__, count, timeout);
946			return 0;
947		} else if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) {
948			doorbell = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
949			if ((doorbell & MPI2_IOC_STATE_MASK) ==
950			    MPI2_IOC_STATE_FAULT) {
951				mpr_dprint(sc, MPR_FAULT,
952				    "fault_state(0x%04x)!\n", doorbell);
953				return (EFAULT);
954			}
955		} else if (int_status == 0xFFFFFFFF)
956			goto out;
957
		/*
		 * If it can sleep, sleep for 1 millisecond, else busy-loop
		 * for 0.5 millisecond.
		 */
962		if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
963			msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0, "mprdba",
964			    hz/1000);
965		else if (sleep_flag == CAN_SLEEP)
966			pause("mprdba", hz/1000);
967		else
968			DELAY(500);
969		count++;
970	} while (--cntdn);
971
972out:
973	mpr_dprint(sc, MPR_FAULT, "%s: failed due to timeout count(%d), "
974		"int_status(%x)!\n", __func__, count, int_status);
975	return (ETIMEDOUT);
976}
977
978/* Wait for the chip to signal that the next word in its FIFO can be fetched */
979static int
980mpr_wait_db_int(struct mpr_softc *sc)
981{
982	int retry;
983
984	for (retry = 0; retry < MPR_DB_MAX_WAIT; retry++) {
985		if ((mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET) &
986		    MPI2_HIS_IOC2SYS_DB_STATUS) != 0)
987			return (0);
988		DELAY(2000);
989	}
990	return (ETIMEDOUT);
991}
992
993/* Step through the synchronous command state machine, i.e. "Doorbell mode" */
994static int
995mpr_request_sync(struct mpr_softc *sc, void *req, MPI2_DEFAULT_REPLY *reply,
996    int req_sz, int reply_sz, int timeout)
997{
998	uint32_t *data32;
999	uint16_t *data16;
1000	int i, count, ioc_sz, residual;
1001	int sleep_flags = CAN_SLEEP;
1002
1003	if (curthread->td_no_sleeping)
1004		sleep_flags = NO_SLEEP;
1005
1006	/* Step 1 */
1007	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
1008
1009	/* Step 2 */
1010	if (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
1011		return (EBUSY);
1012
	/* Step 3
	 * Announce that a message is coming through the doorbell.  Messages
	 * are pushed as 32-bit words, so round up if needed.
	 */
1017	count = (req_sz + 3) / 4;
1018	mpr_regwrite(sc, MPI2_DOORBELL_OFFSET,
1019	    (MPI2_FUNCTION_HANDSHAKE << MPI2_DOORBELL_FUNCTION_SHIFT) |
1020	    (count << MPI2_DOORBELL_ADD_DWORDS_SHIFT));
1021
1022	/* Step 4 */
1023	if (mpr_wait_db_int(sc) ||
1024	    (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) == 0) {
1025		mpr_dprint(sc, MPR_FAULT, "Doorbell failed to activate\n");
1026		return (ENXIO);
1027	}
1028	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
1029	if (mpr_wait_db_ack(sc, 5, sleep_flags) != 0) {
1030		mpr_dprint(sc, MPR_FAULT, "Doorbell handshake failed\n");
1031		return (ENXIO);
1032	}
1033
1034	/* Step 5 */
1035	/* Clock out the message data synchronously in 32-bit dwords*/
1036	data32 = (uint32_t *)req;
1037	for (i = 0; i < count; i++) {
1038		mpr_regwrite(sc, MPI2_DOORBELL_OFFSET, htole32(data32[i]));
1039		if (mpr_wait_db_ack(sc, 5, sleep_flags) != 0) {
1040			mpr_dprint(sc, MPR_FAULT,
1041			    "Timeout while writing doorbell\n");
1042			return (ENXIO);
1043		}
1044	}
1045
	/* Step 6 */
	/* Clock in the reply in 16-bit words.  The total length of the
	 * message is always in the 4th byte, so clock in the first 2 words
	 * manually, then loop the rest.
	 */
1051	data16 = (uint16_t *)reply;
1052	if (mpr_wait_db_int(sc) != 0) {
1053		mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell 0\n");
1054		return (ENXIO);
1055	}
1056
	/*
	 * On a big-endian platform, swap bytes with le16toh so that the
	 * neighboring 8-bit fields in the destination structure pointed to
	 * by data16 are not disturbed.
	 */
1062	data16[0] =
1063	    le16toh(mpr_regread(sc, MPI2_DOORBELL_OFFSET)) & MPI2_DOORBELL_DATA_MASK;
1064	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
1065	if (mpr_wait_db_int(sc) != 0) {
1066		mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell 1\n");
1067		return (ENXIO);
1068	}
1069	data16[1] =
1070	    le16toh(mpr_regread(sc, MPI2_DOORBELL_OFFSET)) & MPI2_DOORBELL_DATA_MASK;
1071	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
1072
1073	/* Number of 32bit words in the message */
1074	ioc_sz = reply->MsgLength;
1075
1076	/*
1077	 * Figure out how many 16bit words to clock in without overrunning.
1078	 * The precision loss with dividing reply_sz can safely be
1079	 * ignored because the messages can only be multiples of 32bits.
1080	 */
1081	residual = 0;
1082	count = MIN((reply_sz / 4), ioc_sz) * 2;
1083	if (count < ioc_sz * 2) {
1084		residual = ioc_sz * 2 - count;
1085		mpr_dprint(sc, MPR_ERROR, "Driver error, throwing away %d "
1086		    "residual message words\n", residual);
1087	}
1088
1089	for (i = 2; i < count; i++) {
1090		if (mpr_wait_db_int(sc) != 0) {
1091			mpr_dprint(sc, MPR_FAULT,
1092			    "Timeout reading doorbell %d\n", i);
1093			return (ENXIO);
1094		}
1095		data16[i] = le16toh(mpr_regread(sc, MPI2_DOORBELL_OFFSET)) &
1096		    MPI2_DOORBELL_DATA_MASK;
1097		mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
1098	}
1099
1100	/*
1101	 * Pull out residual words that won't fit into the provided buffer.
1102	 * This keeps the chip from hanging due to a driver programming
1103	 * error.
1104	 */
1105	while (residual--) {
1106		if (mpr_wait_db_int(sc) != 0) {
1107			mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell\n");
1108			return (ENXIO);
1109		}
1110		(void)mpr_regread(sc, MPI2_DOORBELL_OFFSET);
1111		mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
1112	}
1113
1114	/* Step 7 */
1115	if (mpr_wait_db_int(sc) != 0) {
1116		mpr_dprint(sc, MPR_FAULT, "Timeout waiting to exit doorbell\n");
1117		return (ENXIO);
1118	}
1119	if (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
1120		mpr_dprint(sc, MPR_FAULT, "Warning, doorbell still active\n");
1121	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
1122
1123	return (0);
1124}
1125
1126static void
1127mpr_enqueue_request(struct mpr_softc *sc, struct mpr_command *cm)
1128{
1129	request_descriptor_t rd;
1130
1131	MPR_FUNCTRACE(sc);
1132	mpr_dprint(sc, MPR_TRACE, "SMID %u cm %p ccb %p\n",
1133	    cm->cm_desc.Default.SMID, cm, cm->cm_ccb);
1134
1135	if (sc->mpr_flags & MPR_FLAGS_ATTACH_DONE && !(sc->mpr_flags &
1136	    MPR_FLAGS_SHUTDOWN))
1137		mtx_assert(&sc->mpr_mtx, MA_OWNED);
1138
1139	if (++sc->io_cmds_active > sc->io_cmds_highwater)
1140		sc->io_cmds_highwater++;
1141
1142	KASSERT(cm->cm_state == MPR_CM_STATE_BUSY,
1143	    ("command not busy, state = %u\n", cm->cm_state));
1144	cm->cm_state = MPR_CM_STATE_INQUEUE;
1145
1146	if (sc->atomic_desc_capable) {
1147		rd.u.low = cm->cm_desc.Words.Low;
1148		mpr_regwrite(sc, MPI26_ATOMIC_REQUEST_DESCRIPTOR_POST_OFFSET,
1149		    rd.u.low);
1150	} else {
1151		rd.u.low = htole32(cm->cm_desc.Words.Low);
1152		rd.u.high = htole32(cm->cm_desc.Words.High);
1153		mpr_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_LOW_OFFSET,
1154		    rd.u.low);
1155		mpr_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_HIGH_OFFSET,
1156		    rd.u.high);
1157	}
1158}
1159
/*
 * IOC Facts are read in 16-bit words and stored with le16toh.  This takes
 * care of the endianness of the U8 fields in MPI2_IOC_FACTS_REPLY, but the
 * U16 and U32 fields still need to be swapped back.
 */
1165static void
1166adjust_iocfacts_endianness(MPI2_IOC_FACTS_REPLY *facts)
1167{
1168	facts->HeaderVersion = le16toh(facts->HeaderVersion);
1169	facts->Reserved1 = le16toh(facts->Reserved1);
1170	facts->IOCExceptions = le16toh(facts->IOCExceptions);
1171	facts->IOCStatus = le16toh(facts->IOCStatus);
1172	facts->IOCLogInfo = le32toh(facts->IOCLogInfo);
1173	facts->RequestCredit = le16toh(facts->RequestCredit);
1174	facts->ProductID = le16toh(facts->ProductID);
1175	facts->IOCCapabilities = le32toh(facts->IOCCapabilities);
1176	facts->IOCRequestFrameSize = le16toh(facts->IOCRequestFrameSize);
1177	facts->IOCMaxChainSegmentSize = le16toh(facts->IOCMaxChainSegmentSize);
1178	facts->MaxInitiators = le16toh(facts->MaxInitiators);
1179	facts->MaxTargets = le16toh(facts->MaxTargets);
1180	facts->MaxSasExpanders = le16toh(facts->MaxSasExpanders);
1181	facts->MaxEnclosures = le16toh(facts->MaxEnclosures);
1182	facts->ProtocolFlags = le16toh(facts->ProtocolFlags);
1183	facts->HighPriorityCredit = le16toh(facts->HighPriorityCredit);
1184	facts->MaxReplyDescriptorPostQueueDepth = le16toh(facts->MaxReplyDescriptorPostQueueDepth);
1185	facts->MaxDevHandle = le16toh(facts->MaxDevHandle);
1186	facts->MaxPersistentEntries = le16toh(facts->MaxPersistentEntries);
1187	facts->MinDevHandle = le16toh(facts->MinDevHandle);
1188}
1189
1190/*
1191 * Just the FACTS, ma'am.
1192 */
1193static int
1194mpr_get_iocfacts(struct mpr_softc *sc, MPI2_IOC_FACTS_REPLY *facts)
1195{
1196	MPI2_DEFAULT_REPLY *reply;
1197	MPI2_IOC_FACTS_REQUEST request;
1198	int error, req_sz, reply_sz;
1199
1200	MPR_FUNCTRACE(sc);
1201	mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
1202
1203	req_sz = sizeof(MPI2_IOC_FACTS_REQUEST);
1204	reply_sz = sizeof(MPI2_IOC_FACTS_REPLY);
1205	reply = (MPI2_DEFAULT_REPLY *)facts;
1206
1207	bzero(&request, req_sz);
1208	request.Function = MPI2_FUNCTION_IOC_FACTS;
1209	error = mpr_request_sync(sc, &request, reply, req_sz, reply_sz, 5);
1210
1211	adjust_iocfacts_endianness(facts);
1212	mpr_dprint(sc, MPR_TRACE, "facts->IOCCapabilities 0x%x\n", facts->IOCCapabilities);
1213
1214	mpr_dprint(sc, MPR_INIT, "%s exit, error= %d\n", __func__, error);
1215	return (error);
1216}
1217
1218static int
1219mpr_send_iocinit(struct mpr_softc *sc)
1220{
1221	MPI2_IOC_INIT_REQUEST	init;
1222	MPI2_DEFAULT_REPLY	reply;
1223	int req_sz, reply_sz, error;
1224	struct timeval now;
1225	uint64_t time_in_msec;
1226
1227	MPR_FUNCTRACE(sc);
1228	mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
1229
1230	/* Do a quick sanity check on proper initialization */
1231	if ((sc->pqdepth == 0) || (sc->fqdepth == 0) || (sc->reqframesz == 0)
1232	    || (sc->replyframesz == 0)) {
1233		mpr_dprint(sc, MPR_INIT|MPR_ERROR,
1234		    "Driver not fully initialized for IOCInit\n");
1235		return (EINVAL);
1236	}
1237
1238	req_sz = sizeof(MPI2_IOC_INIT_REQUEST);
1239	reply_sz = sizeof(MPI2_IOC_INIT_REPLY);
1240	bzero(&init, req_sz);
1241	bzero(&reply, reply_sz);
1242
	/*
	 * Fill in the init block.  Note that most addresses are
	 * deliberately in the lower 32 bits of memory.  This is a micro-
	 * optimization for PCI/PCI-X, though it's not clear if it helps PCIe.
	 */
1248	init.Function = MPI2_FUNCTION_IOC_INIT;
1249	init.WhoInit = MPI2_WHOINIT_HOST_DRIVER;
1250	init.MsgVersion = htole16(MPI2_VERSION);
1251	init.HeaderVersion = htole16(MPI2_HEADER_VERSION);
1252	init.SystemRequestFrameSize = htole16((uint16_t)(sc->reqframesz / 4));
1253	init.ReplyDescriptorPostQueueDepth = htole16(sc->pqdepth);
1254	init.ReplyFreeQueueDepth = htole16(sc->fqdepth);
1255	init.SenseBufferAddressHigh = 0;
1256	init.SystemReplyAddressHigh = 0;
1257	init.SystemRequestFrameBaseAddress.High = 0;
1258	init.SystemRequestFrameBaseAddress.Low =
1259	    htole32((uint32_t)sc->req_busaddr);
1260	init.ReplyDescriptorPostQueueAddress.High = 0;
1261	init.ReplyDescriptorPostQueueAddress.Low =
1262	    htole32((uint32_t)sc->post_busaddr);
1263	init.ReplyFreeQueueAddress.High = 0;
1264	init.ReplyFreeQueueAddress.Low = htole32((uint32_t)sc->free_busaddr);
1265	getmicrotime(&now);
1266	time_in_msec = (now.tv_sec * 1000 + now.tv_usec/1000);
1267	init.TimeStamp.High = htole32((time_in_msec >> 32) & 0xFFFFFFFF);
1268	init.TimeStamp.Low = htole32(time_in_msec & 0xFFFFFFFF);
1269	init.HostPageSize = HOST_PAGE_SIZE_4K;
1270
1271	error = mpr_request_sync(sc, &init, &reply, req_sz, reply_sz, 5);
1272	if ((le16toh(reply.IOCStatus) & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
1273		error = ENXIO;
1274
1275	mpr_dprint(sc, MPR_INIT, "IOCInit status= 0x%x\n", le16toh(reply.IOCStatus));
1276	mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
1277	return (error);
1278}
1279
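/*
 * Simple bus_dmamap_load() callback that records the physical address of a
 * single-segment mapping.  The DMA tags loaded with this callback are
 * created with a single segment (BD_NSEGMENTS(1)), so only segs[0] is
 * consulted; the nsegs and error arguments are not examined.
 */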
1280void
1281mpr_memaddr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
1282{
1283	bus_addr_t *addr;
1284
1285	addr = arg;
1286	*addr = segs[0].ds_addr;
1287}
1288
1289void
1290mpr_memaddr_wait_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
1291{
1292	struct mpr_busdma_context *ctx;
1293	int need_unload, need_free;
1294
1295	ctx = (struct mpr_busdma_context *)arg;
1296	need_unload = 0;
1297	need_free = 0;
1298
1299	mpr_lock(ctx->softc);
1300	ctx->error = error;
1301	ctx->completed = 1;
1302	if ((error == 0) && (ctx->abandoned == 0)) {
1303		*ctx->addr = segs[0].ds_addr;
1304	} else {
1305		if (nsegs != 0)
1306			need_unload = 1;
1307		if (ctx->abandoned != 0)
1308			need_free = 1;
1309	}
1310	if (need_free == 0)
1311		wakeup(ctx);
1312
1313	mpr_unlock(ctx->softc);
1314
1315	if (need_unload != 0) {
1316		bus_dmamap_unload(ctx->buffer_dmat,
1317				  ctx->buffer_dmamap);
1318		*ctx->addr = 0;
1319	}
1320
1321	if (need_free != 0)
1322		free(ctx, M_MPR);
1323}
1324
1325static int
1326mpr_alloc_queues(struct mpr_softc *sc)
1327{
1328	struct mpr_queue *q;
1329	int nq, i;
1330
1331	nq = sc->msi_msgs;
1332	mpr_dprint(sc, MPR_INIT|MPR_XINFO, "Allocating %d I/O queues\n", nq);
1333
1334	sc->queues = malloc(sizeof(struct mpr_queue) * nq, M_MPR,
1335	     M_NOWAIT|M_ZERO);
1336	if (sc->queues == NULL)
1337		return (ENOMEM);
1338
1339	for (i = 0; i < nq; i++) {
1340		q = &sc->queues[i];
1341		mpr_dprint(sc, MPR_INIT, "Configuring queue %d %p\n", i, q);
1342		q->sc = sc;
1343		q->qnum = i;
1344	}
1345	return (0);
1346}
1347
1348static int
1349mpr_alloc_hw_queues(struct mpr_softc *sc)
1350{
1351	bus_dma_template_t t;
1352	bus_addr_t queues_busaddr;
1353	uint8_t *queues;
1354	int qsize, fqsize, pqsize;
1355
1356	/*
1357	 * The reply free queue contains 4 byte entries in multiples of 16 and
1358	 * aligned on a 16 byte boundary. There must always be an unused entry.
1359	 * This queue supplies fresh reply frames for the firmware to use.
1360	 *
1361	 * The reply descriptor post queue contains 8 byte entries in
1362	 * multiples of 16 and aligned on a 16 byte boundary.  This queue
1363	 * contains filled-in reply frames sent from the firmware to the host.
1364	 *
1365	 * These two queues are allocated together for simplicity.
1366	 */
1367	sc->fqdepth = roundup2(sc->num_replies + 1, 16);
1368	sc->pqdepth = roundup2(sc->num_replies + 1, 16);
1369	fqsize= sc->fqdepth * 4;
1370	pqsize = sc->pqdepth * 8;
1371	qsize = fqsize + pqsize;
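
	/*
	 * Example sizing (illustrative only): with num_replies = 2047, both
	 * depths round up to 2048, giving fqsize = 2048 * 4 = 8 KB and
	 * pqsize = 2048 * 8 = 16 KB, so a single 24 KB allocation covers
	 * both queues.
	 */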
1372
1373	bus_dma_template_init(&t, sc->mpr_parent_dmat);
1374	BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(qsize),
1375	    BD_MAXSEGSIZE(qsize), BD_NSEGMENTS(1),
1376	    BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
1377	if (bus_dma_template_tag(&t, &sc->queues_dmat)) {
1378		mpr_dprint(sc, MPR_ERROR, "Cannot allocate queues DMA tag\n");
1379		return (ENOMEM);
1380        }
1381        if (bus_dmamem_alloc(sc->queues_dmat, (void **)&queues, BUS_DMA_NOWAIT,
1382	    &sc->queues_map)) {
1383		mpr_dprint(sc, MPR_ERROR, "Cannot allocate queues memory\n");
1384		return (ENOMEM);
1385        }
1386        bzero(queues, qsize);
1387        bus_dmamap_load(sc->queues_dmat, sc->queues_map, queues, qsize,
1388	    mpr_memaddr_cb, &queues_busaddr, 0);
1389
1390	sc->free_queue = (uint32_t *)queues;
1391	sc->free_busaddr = queues_busaddr;
1392	sc->post_queue = (MPI2_REPLY_DESCRIPTORS_UNION *)(queues + fqsize);
1393	sc->post_busaddr = queues_busaddr + fqsize;
1394	mpr_dprint(sc, MPR_INIT, "free queue busaddr= %#016jx size= %d\n",
1395	    (uintmax_t)sc->free_busaddr, fqsize);
1396	mpr_dprint(sc, MPR_INIT, "reply queue busaddr= %#016jx size= %d\n",
1397	    (uintmax_t)sc->post_busaddr, pqsize);
1398
1399	return (0);
1400}
1401
1402static int
1403mpr_alloc_replies(struct mpr_softc *sc)
1404{
1405	bus_dma_template_t t;
1406	int rsize, num_replies;
1407
1408	/* Store the reply frame size in bytes rather than as 32bit words */
1409	sc->replyframesz = sc->facts->ReplyFrameSize * 4;
1410
1411	/*
1412	 * sc->num_replies should be one less than sc->fqdepth.  We need to
1413	 * allocate space for sc->fqdepth replies, but only sc->num_replies
1414	 * replies can be used at once.
1415	 */
1416	num_replies = max(sc->fqdepth, sc->num_replies);
1417
1418	rsize = sc->replyframesz * num_replies;
1419	bus_dma_template_init(&t, sc->mpr_parent_dmat);
1420	BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(4), BD_MAXSIZE(rsize),
1421	    BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
1422	    BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
1423	if (bus_dma_template_tag(&t, &sc->reply_dmat)) {
1424		mpr_dprint(sc, MPR_ERROR, "Cannot allocate replies DMA tag\n");
1425		return (ENOMEM);
1426        }
1427        if (bus_dmamem_alloc(sc->reply_dmat, (void **)&sc->reply_frames,
1428	    BUS_DMA_NOWAIT, &sc->reply_map)) {
1429		mpr_dprint(sc, MPR_ERROR, "Cannot allocate replies memory\n");
1430		return (ENOMEM);
1431        }
1432        bzero(sc->reply_frames, rsize);
1433        bus_dmamap_load(sc->reply_dmat, sc->reply_map, sc->reply_frames, rsize,
1434	    mpr_memaddr_cb, &sc->reply_busaddr, 0);
1435	mpr_dprint(sc, MPR_INIT, "reply frames busaddr= %#016jx size= %d\n",
1436	    (uintmax_t)sc->reply_busaddr, rsize);
1437
1438	return (0);
1439}
1440
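/*
 * bus_dmamap_load() callback for the chain frame memory.  It walks the
 * returned segments, carves each one into chain_frame_size pieces, points
 * the preallocated struct mpr_chain entries at those pieces, and puts them
 * on the free chain list.  Any tail of a segment too small for a full
 * chain frame is skipped.
 */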
1441static void
1442mpr_load_chains_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
1443{
1444	struct mpr_softc *sc = arg;
1445	struct mpr_chain *chain;
1446	bus_size_t bo;
1447	int i, o, s;
1448
1449	if (error != 0)
1450		return;
1451
1452	for (i = 0, o = 0, s = 0; s < nsegs; s++) {
1453		for (bo = 0; bo + sc->chain_frame_size <= segs[s].ds_len;
1454		    bo += sc->chain_frame_size) {
1455			chain = &sc->chains[i++];
1456			chain->chain =(MPI2_SGE_IO_UNION *)(sc->chain_frames+o);
1457			chain->chain_busaddr = segs[s].ds_addr + bo;
1458			o += sc->chain_frame_size;
1459			mpr_free_chain(sc, chain);
1460		}
1461		if (bo != segs[s].ds_len)
1462			o += segs[s].ds_len - bo;
1463	}
1464	sc->chain_free_lowwater = i;
1465}
1466
1467static int
1468mpr_alloc_requests(struct mpr_softc *sc)
1469{
1470	bus_dma_template_t t;
1471	struct mpr_command *cm;
1472	int i, rsize, nsegs;
1473
1474	rsize = sc->reqframesz * sc->num_reqs;
1475	bus_dma_template_init(&t, sc->mpr_parent_dmat);
1476	BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(rsize),
1477	    BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
1478	    BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
1479	if (bus_dma_template_tag(&t, &sc->req_dmat)) {
1480		mpr_dprint(sc, MPR_ERROR, "Cannot allocate request DMA tag\n");
1481		return (ENOMEM);
1482        }
1483        if (bus_dmamem_alloc(sc->req_dmat, (void **)&sc->req_frames,
1484	    BUS_DMA_NOWAIT, &sc->req_map)) {
1485		mpr_dprint(sc, MPR_ERROR, "Cannot allocate request memory\n");
1486		return (ENOMEM);
1487        }
1488        bzero(sc->req_frames, rsize);
1489        bus_dmamap_load(sc->req_dmat, sc->req_map, sc->req_frames, rsize,
1490	    mpr_memaddr_cb, &sc->req_busaddr, 0);
1491	mpr_dprint(sc, MPR_INIT, "request frames busaddr= %#016jx size= %d\n",
1492	    (uintmax_t)sc->req_busaddr, rsize);
1493
1494	sc->chains = malloc(sizeof(struct mpr_chain) * sc->num_chains, M_MPR,
1495	    M_NOWAIT | M_ZERO);
1496	if (!sc->chains) {
1497		mpr_dprint(sc, MPR_ERROR, "Cannot allocate chain memory\n");
1498		return (ENOMEM);
1499	}
1500	rsize = sc->chain_frame_size * sc->num_chains;
1501	bus_dma_template_init(&t, sc->mpr_parent_dmat);
1502	BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(rsize),
1503	    BD_MAXSEGSIZE(rsize), BD_NSEGMENTS((howmany(rsize, PAGE_SIZE))),
1504	    BD_BOUNDARY(BUS_SPACE_MAXSIZE_32BIT+1));
1505	if (bus_dma_template_tag(&t, &sc->chain_dmat)) {
1506		mpr_dprint(sc, MPR_ERROR, "Cannot allocate chain DMA tag\n");
1507		return (ENOMEM);
1508	}
1509	if (bus_dmamem_alloc(sc->chain_dmat, (void **)&sc->chain_frames,
1510	    BUS_DMA_NOWAIT | BUS_DMA_ZERO, &sc->chain_map)) {
1511		mpr_dprint(sc, MPR_ERROR, "Cannot allocate chain memory\n");
1512		return (ENOMEM);
1513	}
1514	if (bus_dmamap_load(sc->chain_dmat, sc->chain_map, sc->chain_frames,
1515	    rsize, mpr_load_chains_cb, sc, BUS_DMA_NOWAIT)) {
1516		mpr_dprint(sc, MPR_ERROR, "Cannot load chain memory\n");
1517		bus_dmamem_free(sc->chain_dmat, sc->chain_frames,
1518		    sc->chain_map);
1519		return (ENOMEM);
1520	}
1521
1522	rsize = MPR_SENSE_LEN * sc->num_reqs;
1523	bus_dma_template_clone(&t, sc->req_dmat);
1524	BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(1), BD_MAXSIZE(rsize),
1525	    BD_MAXSEGSIZE(rsize));
1526	if (bus_dma_template_tag(&t, &sc->sense_dmat)) {
1527		mpr_dprint(sc, MPR_ERROR, "Cannot allocate sense DMA tag\n");
1528		return (ENOMEM);
1529        }
1530        if (bus_dmamem_alloc(sc->sense_dmat, (void **)&sc->sense_frames,
1531	    BUS_DMA_NOWAIT, &sc->sense_map)) {
1532		mpr_dprint(sc, MPR_ERROR, "Cannot allocate sense memory\n");
1533		return (ENOMEM);
1534        }
1535        bzero(sc->sense_frames, rsize);
1536        bus_dmamap_load(sc->sense_dmat, sc->sense_map, sc->sense_frames, rsize,
1537	    mpr_memaddr_cb, &sc->sense_busaddr, 0);
1538	mpr_dprint(sc, MPR_INIT, "sense frames busaddr= %#016jx size= %d\n",
1539	    (uintmax_t)sc->sense_busaddr, rsize);
1540
1541	/*
1542	 * Allocate NVMe PRP Pages for NVMe SGL support only if the FW supports
1543	 * these devices.
1544	 */
1545	if ((sc->facts->MsgVersion >= MPI2_VERSION_02_06) &&
1546	    (sc->facts->ProtocolFlags & MPI2_IOCFACTS_PROTOCOL_NVME_DEVICES)) {
1547		if (mpr_alloc_nvme_prp_pages(sc) == ENOMEM)
1548			return (ENOMEM);
1549	}
1550
1551	nsegs = (sc->maxio / PAGE_SIZE) + 1;
1552	bus_dma_template_init(&t, sc->mpr_parent_dmat);
1553	BUS_DMA_TEMPLATE_FILL(&t, BD_MAXSIZE(BUS_SPACE_MAXSIZE_32BIT),
1554	    BD_NSEGMENTS(nsegs), BD_MAXSEGSIZE(BUS_SPACE_MAXSIZE_32BIT),
1555	    BD_FLAGS(BUS_DMA_ALLOCNOW), BD_LOCKFUNC(busdma_lock_mutex),
1556	    BD_LOCKFUNCARG(&sc->mpr_mtx),
1557	    BD_BOUNDARY(BUS_SPACE_MAXSIZE_32BIT+1));
1558	if (bus_dma_template_tag(&t, &sc->buffer_dmat)) {
1559		mpr_dprint(sc, MPR_ERROR, "Cannot allocate buffer DMA tag\n");
1560		return (ENOMEM);
1561        }
1562
1563	/*
1564	 * SMID 0 cannot be used as a free command per the firmware spec.
1565	 * Just drop that command instead of risking accounting bugs.
1566	 */
1567	sc->commands = malloc(sizeof(struct mpr_command) * sc->num_reqs,
1568	    M_MPR, M_WAITOK | M_ZERO);
1569	for (i = 1; i < sc->num_reqs; i++) {
1570		cm = &sc->commands[i];
1571		cm->cm_req = sc->req_frames + i * sc->reqframesz;
1572		cm->cm_req_busaddr = sc->req_busaddr + i * sc->reqframesz;
1573		cm->cm_sense = &sc->sense_frames[i];
1574		cm->cm_sense_busaddr = sc->sense_busaddr + i * MPR_SENSE_LEN;
1575		cm->cm_desc.Default.SMID = htole16(i);
1576		cm->cm_sc = sc;
1577		cm->cm_state = MPR_CM_STATE_BUSY;
1578		TAILQ_INIT(&cm->cm_chain_list);
1579		TAILQ_INIT(&cm->cm_prp_page_list);
1580		callout_init_mtx(&cm->cm_callout, &sc->mpr_mtx, 0);
1581
1582		/* XXX Is a failure here a critical problem? */
1583		if (bus_dmamap_create(sc->buffer_dmat, 0, &cm->cm_dmamap)
1584		    == 0) {
1585			if (i <= sc->num_prireqs)
1586				mpr_free_high_priority_command(sc, cm);
1587			else
1588				mpr_free_command(sc, cm);
1589		} else {
1590			panic("failed to allocate command %d\n", i);
1591			sc->num_reqs = i;
1592			break;
1593		}
1594	}
1595
1596	return (0);
1597}
1598
1599/*
1600 * Allocate contiguous buffers for PCIe NVMe devices for building native PRPs,
1601 * which are scatter/gather lists for NVMe devices.
1602 *
1603 * This buffer must be contiguous due to the nature of how NVMe PRPs are built
1604 * and translated by FW.
1605 *
1606 * returns ENOMEM if memory could not be allocated, otherwise returns 0.
1607 */
1608static int
1609mpr_alloc_nvme_prp_pages(struct mpr_softc *sc)
1610{
1611	bus_dma_template_t t;
1612	struct mpr_prp_page *prp_page;
1613	int PRPs_per_page, PRPs_required, pages_required;
1614	int rsize, i;
1615
1616	/*
1617	 * Assuming a MAX_IO_SIZE of 1MB and a PAGE_SIZE of 4k, the max number
1618	 * of PRPs (NVMe's Scatter/Gather Element) needed per I/O is:
1619	 * MAX_IO_SIZE / PAGE_SIZE = 256
1620	 *
1621	 * 1 PRP entry in main frame for PRP list pointer still leaves 255 PRPs
1622	 * required for the remainder of the 1MB I/O. 512 PRPs can fit into one
1623	 * page (4096 / 8 = 512), so only one page is required for each I/O.
1624	 *
1625	 * Each of these buffers will need to be contiguous. For simplicity,
1626	 * only one buffer is allocated here, which has all of the space
1627	 * required for the NVMe Queue Depth. If there are problems allocating
1628	 * this one buffer, this function will need to change to allocate
1629	 * individual, contiguous NVME_QDEPTH buffers.
1630	 *
1631	 * The real calculation will use the real max io size. Above is just an
1632	 * example.
1633	 *
1634	 */
1635	PRPs_required = sc->maxio / PAGE_SIZE;
1636	PRPs_per_page = (PAGE_SIZE / PRP_ENTRY_SIZE) - 1;
1637	pages_required = (PRPs_required / PRPs_per_page) + 1;
1638
1639	sc->prp_buffer_size = PAGE_SIZE * pages_required;
1640	rsize = sc->prp_buffer_size * NVME_QDEPTH;
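	/*
	 * A quick sketch of the sizing above, assuming a 1MB maxio, 4KB pages,
	 * and 8-byte PRP entries: PRPs_required = 256, PRPs_per_page = 511,
	 * pages_required = 1, so prp_buffer_size is one 4KB page and rsize is
	 * NVME_QDEPTH pages in a single contiguous allocation.
	 */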
1641	bus_dma_template_init(&t, sc->mpr_parent_dmat);
1642	BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(4), BD_MAXSIZE(rsize),
1643	    BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
1644	    BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
1645	if (bus_dma_template_tag(&t, &sc->prp_page_dmat)) {
1646		mpr_dprint(sc, MPR_ERROR, "Cannot allocate NVMe PRP DMA "
1647		    "tag\n");
1648		return (ENOMEM);
1649	}
1650	if (bus_dmamem_alloc(sc->prp_page_dmat, (void **)&sc->prp_pages,
1651	    BUS_DMA_NOWAIT, &sc->prp_page_map)) {
1652		mpr_dprint(sc, MPR_ERROR, "Cannot allocate NVMe PRP memory\n");
1653		return (ENOMEM);
1654	}
1655	bzero(sc->prp_pages, rsize);
1656	bus_dmamap_load(sc->prp_page_dmat, sc->prp_page_map, sc->prp_pages,
1657	    rsize, mpr_memaddr_cb, &sc->prp_page_busaddr, 0);
1658
1659	sc->prps = malloc(sizeof(struct mpr_prp_page) * NVME_QDEPTH, M_MPR,
1660	    M_WAITOK | M_ZERO);
1661	for (i = 0; i < NVME_QDEPTH; i++) {
1662		prp_page = &sc->prps[i];
1663		prp_page->prp_page = (uint64_t *)(sc->prp_pages +
1664		    i * sc->prp_buffer_size);
1665		prp_page->prp_page_busaddr = (uint64_t)(sc->prp_page_busaddr +
1666		    i * sc->prp_buffer_size);
1667		mpr_free_prp_page(sc, prp_page);
1668		sc->prp_pages_free_lowwater++;
1669	}
1670
1671	return (0);
1672}
1673
1674static int
1675mpr_init_queues(struct mpr_softc *sc)
1676{
1677	int i;
1678
1679	memset((uint8_t *)sc->post_queue, 0xff, sc->pqdepth * 8);
1680
1681	/*
1682	 * According to the spec, we need to use one less reply than we
1683	 * have space for on the queue.  So sc->num_replies (the number we
1684	 * use) should be less than sc->fqdepth (allocated size).
1685	 */
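	/*
	 * Presumably the usual ring convention: sacrificing one slot keeps a
	 * completely full queue distinguishable from an empty one.  For
	 * example, with an fqdepth of 1024, num_replies must be at most 1023.
	 */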
1686	if (sc->num_replies >= sc->fqdepth)
1687		return (EINVAL);
1688
1689	/*
1690	 * Initialize all of the free queue entries.
1691	 */
1692	for (i = 0; i < sc->fqdepth; i++) {
1693		sc->free_queue[i] = htole32(sc->reply_busaddr + (i * sc->replyframesz));
1694	}
1695	sc->replyfreeindex = sc->num_replies;
1696
1697	return (0);
1698}
1699
1700/* Get the driver parameter tunables.  Lowest priority are the driver defaults.
1701 * Next are the global settings, if they exist.  Highest are the per-unit
1702 * settings, if they exist.
1703 */
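/*
 * A sketch of how these are set in practice (hypothetical unit and values):
 * global defaults go in loader.conf(5), e.g.
 *	hw.mpr.debug_level="info,fault"
 *	hw.mpr.max_chains=4096
 * and can be overridden for a single adapter with
 *	dev.mpr.0.debug_level="info,fault,event"
 *	dev.mpr.0.max_chains=8192
 * matching the TUNABLE_*_FETCH calls below.
 */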
1704void
1705mpr_get_tunables(struct mpr_softc *sc)
1706{
1707	char tmpstr[80], mpr_debug[80];
1708
1709	/* XXX default to some debugging for now */
1710	sc->mpr_debug = MPR_INFO | MPR_FAULT;
1711	sc->disable_msix = 0;
1712	sc->disable_msi = 0;
1713	sc->max_msix = MPR_MSIX_MAX;
1714	sc->max_chains = MPR_CHAIN_FRAMES;
1715	sc->max_io_pages = MPR_MAXIO_PAGES;
1716	sc->enable_ssu = MPR_SSU_ENABLE_SSD_DISABLE_HDD;
1717	sc->spinup_wait_time = DEFAULT_SPINUP_WAIT;
1718	sc->use_phynum = 1;
1719	sc->max_reqframes = MPR_REQ_FRAMES;
1720	sc->max_prireqframes = MPR_PRI_REQ_FRAMES;
1721	sc->max_replyframes = MPR_REPLY_FRAMES;
1722	sc->max_evtframes = MPR_EVT_REPLY_FRAMES;
1723
1724	/*
1725	 * Grab the global variables.
1726	 */
1727	bzero(mpr_debug, 80);
1728	if (TUNABLE_STR_FETCH("hw.mpr.debug_level", mpr_debug, 80) != 0)
1729		mpr_parse_debug(sc, mpr_debug);
1730	TUNABLE_INT_FETCH("hw.mpr.disable_msix", &sc->disable_msix);
1731	TUNABLE_INT_FETCH("hw.mpr.disable_msi", &sc->disable_msi);
1732	TUNABLE_INT_FETCH("hw.mpr.max_msix", &sc->max_msix);
1733	TUNABLE_INT_FETCH("hw.mpr.max_chains", &sc->max_chains);
1734	TUNABLE_INT_FETCH("hw.mpr.max_io_pages", &sc->max_io_pages);
1735	TUNABLE_INT_FETCH("hw.mpr.enable_ssu", &sc->enable_ssu);
1736	TUNABLE_INT_FETCH("hw.mpr.spinup_wait_time", &sc->spinup_wait_time);
1737	TUNABLE_INT_FETCH("hw.mpr.use_phy_num", &sc->use_phynum);
1738	TUNABLE_INT_FETCH("hw.mpr.max_reqframes", &sc->max_reqframes);
1739	TUNABLE_INT_FETCH("hw.mpr.max_prireqframes", &sc->max_prireqframes);
1740	TUNABLE_INT_FETCH("hw.mpr.max_replyframes", &sc->max_replyframes);
1741	TUNABLE_INT_FETCH("hw.mpr.max_evtframes", &sc->max_evtframes);
1742
1743	/* Grab the unit-instance variables */
1744	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.debug_level",
1745	    device_get_unit(sc->mpr_dev));
1746	bzero(mpr_debug, 80);
1747	if (TUNABLE_STR_FETCH(tmpstr, mpr_debug, 80) != 0)
1748		mpr_parse_debug(sc, mpr_debug);
1749
1750	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.disable_msix",
1751	    device_get_unit(sc->mpr_dev));
1752	TUNABLE_INT_FETCH(tmpstr, &sc->disable_msix);
1753
1754	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.disable_msi",
1755	    device_get_unit(sc->mpr_dev));
1756	TUNABLE_INT_FETCH(tmpstr, &sc->disable_msi);
1757
1758	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_msix",
1759	    device_get_unit(sc->mpr_dev));
1760	TUNABLE_INT_FETCH(tmpstr, &sc->max_msix);
1761
1762	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_chains",
1763	    device_get_unit(sc->mpr_dev));
1764	TUNABLE_INT_FETCH(tmpstr, &sc->max_chains);
1765
1766	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_io_pages",
1767	    device_get_unit(sc->mpr_dev));
1768	TUNABLE_INT_FETCH(tmpstr, &sc->max_io_pages);
1769
1770	bzero(sc->exclude_ids, sizeof(sc->exclude_ids));
1771	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.exclude_ids",
1772	    device_get_unit(sc->mpr_dev));
1773	TUNABLE_STR_FETCH(tmpstr, sc->exclude_ids, sizeof(sc->exclude_ids));
1774
1775	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.enable_ssu",
1776	    device_get_unit(sc->mpr_dev));
1777	TUNABLE_INT_FETCH(tmpstr, &sc->enable_ssu);
1778
1779	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.spinup_wait_time",
1780	    device_get_unit(sc->mpr_dev));
1781	TUNABLE_INT_FETCH(tmpstr, &sc->spinup_wait_time);
1782
1783	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.use_phy_num",
1784	    device_get_unit(sc->mpr_dev));
1785	TUNABLE_INT_FETCH(tmpstr, &sc->use_phynum);
1786
1787	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_reqframes",
1788	    device_get_unit(sc->mpr_dev));
1789	TUNABLE_INT_FETCH(tmpstr, &sc->max_reqframes);
1790
1791	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_prireqframes",
1792	    device_get_unit(sc->mpr_dev));
1793	TUNABLE_INT_FETCH(tmpstr, &sc->max_prireqframes);
1794
1795	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_replyframes",
1796	    device_get_unit(sc->mpr_dev));
1797	TUNABLE_INT_FETCH(tmpstr, &sc->max_replyframes);
1798
1799	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_evtframes",
1800	    device_get_unit(sc->mpr_dev));
1801	TUNABLE_INT_FETCH(tmpstr, &sc->max_evtframes);
1802}
1803
1804static void
1805mpr_setup_sysctl(struct mpr_softc *sc)
1806{
1807	struct sysctl_ctx_list	*sysctl_ctx = NULL;
1808	struct sysctl_oid	*sysctl_tree = NULL;
1809	char tmpstr[80], tmpstr2[80];
1810
1811	/*
1812	 * Setup the sysctl variable so the user can change the debug level
1813	 * on the fly.
1814	 */
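	/*
	 * For example (hypothetical unit 0), the level can then be read or
	 * changed at runtime with sysctl(8):
	 *	sysctl dev.mpr.0.debug_level
	 *	sysctl dev.mpr.0.debug_level=+trace
	 */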
1815	snprintf(tmpstr, sizeof(tmpstr), "MPR controller %d",
1816	    device_get_unit(sc->mpr_dev));
1817	snprintf(tmpstr2, sizeof(tmpstr2), "%d", device_get_unit(sc->mpr_dev));
1818
1819	sysctl_ctx = device_get_sysctl_ctx(sc->mpr_dev);
1820	if (sysctl_ctx != NULL)
1821		sysctl_tree = device_get_sysctl_tree(sc->mpr_dev);
1822
1823	if (sysctl_tree == NULL) {
1824		sysctl_ctx_init(&sc->sysctl_ctx);
1825		sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
1826		    SYSCTL_STATIC_CHILDREN(_hw_mpr), OID_AUTO, tmpstr2,
1827		    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr);
1828		if (sc->sysctl_tree == NULL)
1829			return;
1830		sysctl_ctx = &sc->sysctl_ctx;
1831		sysctl_tree = sc->sysctl_tree;
1832	}
1833
1834	SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1835	    OID_AUTO, "debug_level", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
1836	    sc, 0, mpr_debug_sysctl, "A", "mpr debug level");
1837
1838	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1839	    OID_AUTO, "disable_msix", CTLFLAG_RD, &sc->disable_msix, 0,
1840	    "Disable the use of MSI-X interrupts");
1841
1842	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1843	    OID_AUTO, "max_msix", CTLFLAG_RD, &sc->max_msix, 0,
1844	    "User-defined maximum number of MSIX queues");
1845
1846	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1847	    OID_AUTO, "msix_msgs", CTLFLAG_RD, &sc->msi_msgs, 0,
1848	    "Negotiated number of MSIX queues");
1849
1850	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1851	    OID_AUTO, "max_reqframes", CTLFLAG_RD, &sc->max_reqframes, 0,
1852	    "Total number of allocated request frames");
1853
1854	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1855	    OID_AUTO, "max_prireqframes", CTLFLAG_RD, &sc->max_prireqframes, 0,
1856	    "Total number of allocated high priority request frames");
1857
1858	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1859	    OID_AUTO, "max_replyframes", CTLFLAG_RD, &sc->max_replyframes, 0,
1860	    "Total number of allocated reply frames");
1861
1862	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1863	    OID_AUTO, "max_evtframes", CTLFLAG_RD, &sc->max_evtframes, 0,
1864	    "Total number of event frames allocated");
1865
1866	SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1867	    OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version,
1868	    strlen(sc->fw_version), "firmware version");
1869
1870	SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1871	    OID_AUTO, "driver_version", CTLFLAG_RD, MPR_DRIVER_VERSION,
1872	    strlen(MPR_DRIVER_VERSION), "driver version");
1873
1874	SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1875	    OID_AUTO, "msg_version", CTLFLAG_RD, sc->msg_version,
1876	    strlen(sc->msg_version), "message interface version (deprecated)");
1877
1878	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1879	    OID_AUTO, "io_cmds_active", CTLFLAG_RD,
1880	    &sc->io_cmds_active, 0, "number of currently active commands");
1881
1882	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1883	    OID_AUTO, "io_cmds_highwater", CTLFLAG_RD,
1884	    &sc->io_cmds_highwater, 0, "maximum active commands seen");
1885
1886	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1887	    OID_AUTO, "chain_free", CTLFLAG_RD,
1888	    &sc->chain_free, 0, "number of free chain elements");
1889
1890	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1891	    OID_AUTO, "chain_free_lowwater", CTLFLAG_RD,
1892	    &sc->chain_free_lowwater, 0,"lowest number of free chain elements");
1893
1894	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1895	    OID_AUTO, "max_chains", CTLFLAG_RD,
1896	    &sc->max_chains, 0,"maximum chain frames that will be allocated");
1897
1898	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1899	    OID_AUTO, "max_io_pages", CTLFLAG_RD,
1900	    &sc->max_io_pages, 0,"maximum pages to allow per I/O (if <1 use "
1901	    "IOCFacts)");
1902
1903	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1904	    OID_AUTO, "enable_ssu", CTLFLAG_RW, &sc->enable_ssu, 0,
1905	    "enable SSU to SATA SSD/HDD at shutdown");
1906
1907	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1908	    OID_AUTO, "chain_alloc_fail", CTLFLAG_RD,
1909	    &sc->chain_alloc_fail, "chain allocation failures");
1910
1911	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1912	    OID_AUTO, "spinup_wait_time", CTLFLAG_RD,
1913	    &sc->spinup_wait_time, DEFAULT_SPINUP_WAIT, "seconds to wait for "
1914	    "spinup after SATA ID error");
1915
1916	SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1917	    OID_AUTO, "dump_reqs",
1918	    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
1919	    sc, 0, mpr_dump_reqs, "I", "Dump Active Requests");
1920
1921	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1922	    OID_AUTO, "dump_reqs_alltypes", CTLFLAG_RW,
1923	    &sc->dump_reqs_alltypes, 0,
1924	    "dump all request types not just inqueue");
1925
1926	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1927	    OID_AUTO, "use_phy_num", CTLFLAG_RD, &sc->use_phynum, 0,
1928	    "Use the phy number for enumeration");
1929
1930	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1931	    OID_AUTO, "prp_pages_free", CTLFLAG_RD,
1932	    &sc->prp_pages_free, 0, "number of free PRP pages");
1933
1934	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1935	    OID_AUTO, "prp_pages_free_lowwater", CTLFLAG_RD,
1936	    &sc->prp_pages_free_lowwater, 0,"lowest number of free PRP pages");
1937
1938	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1939	    OID_AUTO, "prp_page_alloc_fail", CTLFLAG_RD,
1940	    &sc->prp_page_alloc_fail, "PRP page allocation failures");
1941}
1942
1943static struct mpr_debug_string {
1944	char *name;
1945	int flag;
1946} mpr_debug_strings[] = {
1947	{"info", MPR_INFO},
1948	{"fault", MPR_FAULT},
1949	{"event", MPR_EVENT},
1950	{"log", MPR_LOG},
1951	{"recovery", MPR_RECOVERY},
1952	{"error", MPR_ERROR},
1953	{"init", MPR_INIT},
1954	{"xinfo", MPR_XINFO},
1955	{"user", MPR_USER},
1956	{"mapping", MPR_MAPPING},
1957	{"trace", MPR_TRACE}
1958};
1959
1960enum mpr_debug_level_combiner {
1961	COMB_NONE,
1962	COMB_ADD,
1963	COMB_SUB
1964};
1965
1966static int
1967mpr_debug_sysctl(SYSCTL_HANDLER_ARGS)
1968{
1969	struct mpr_softc *sc;
1970	struct mpr_debug_string *string;
1971	struct sbuf *sbuf;
1972	char *buffer;
1973	size_t sz;
1974	int i, len, debug, error;
1975
1976	sc = (struct mpr_softc *)arg1;
1977
1978	error = sysctl_wire_old_buffer(req, 0);
1979	if (error != 0)
1980		return (error);
1981
1982	sbuf = sbuf_new_for_sysctl(NULL, NULL, 128, req);
1983	debug = sc->mpr_debug;
1984
1985	sbuf_printf(sbuf, "%#x", debug);
1986
1987	sz = sizeof(mpr_debug_strings) / sizeof(mpr_debug_strings[0]);
1988	for (i = 0; i < sz; i++) {
1989		string = &mpr_debug_strings[i];
1990		if (debug & string->flag)
1991			sbuf_printf(sbuf, ",%s", string->name);
1992	}
1993
1994	error = sbuf_finish(sbuf);
1995	sbuf_delete(sbuf);
1996
1997	if (error || req->newptr == NULL)
1998		return (error);
1999
2000	len = req->newlen - req->newidx;
2001	if (len == 0)
2002		return (0);
2003
2004	buffer = malloc(len, M_MPR, M_ZERO|M_WAITOK);
2005	error = SYSCTL_IN(req, buffer, len);
2006
2007	mpr_parse_debug(sc, buffer);
2008
2009	free(buffer, M_MPR);
2010	return (error);
2011}
2012
2013static void
2014mpr_parse_debug(struct mpr_softc *sc, char *list)
2015{
2016	struct mpr_debug_string *string;
2017	enum mpr_debug_level_combiner op;
2018	char *token, *endtoken;
2019	size_t sz;
2020	int flags, i;
2021
2022	if (list == NULL || *list == '\0')
2023		return;
2024
2025	if (*list == '+') {
2026		op = COMB_ADD;
2027		list++;
2028	} else if (*list == '-') {
2029		op = COMB_SUB;
2030		list++;
2031	} else
2032		op = COMB_NONE;
2033	if (*list == '\0')
2034		return;
2035
2036	flags = 0;
2037	sz = sizeof(mpr_debug_strings) / sizeof(mpr_debug_strings[0]);
2038	while ((token = strsep(&list, ":,")) != NULL) {
2039		/* Handle integer flags */
2040		flags |= strtol(token, &endtoken, 0);
2041		if (token != endtoken)
2042			continue;
2043
2044		/* Handle text flags */
2045		for (i = 0; i < sz; i++) {
2046			string = &mpr_debug_strings[i];
2047			if (strcasecmp(token, string->name) == 0) {
2048				flags |= string->flag;
2049				break;
2050			}
2051		}
2052	}
2053
2054	switch (op) {
2055	case COMB_NONE:
2056		sc->mpr_debug = flags;
2057		break;
2058	case COMB_ADD:
2059		sc->mpr_debug |= flags;
2060		break;
2061	case COMB_SUB:
2062		sc->mpr_debug &= (~flags);
2063		break;
2064	}
2065	return;
2066}
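/*
 * A few illustrative strings mpr_parse_debug() accepts: "info,fault" replaces
 * the mask with MPR_INFO|MPR_FAULT, "+trace" adds the trace bit, "-xinfo"
 * clears the xinfo bit, and a bare number such as "0x3" is taken as a raw
 * flag mask via strtol().
 */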
2067
2068struct mpr_dumpreq_hdr {
2069	uint32_t	smid;
2070	uint32_t	state;
2071	uint32_t	numframes;
2072	uint32_t	deschi;
2073	uint32_t	desclo;
2074};
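/*
 * Sketch of the stream produced by the dump_reqs sysctl below: for each
 * selected command one mpr_dumpreq_hdr is emitted, followed by 128 bytes of
 * the request frame and 128 bytes per chain frame; numframes counts the
 * request frame plus its chains.
 */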
2075
2076static int
2077mpr_dump_reqs(SYSCTL_HANDLER_ARGS)
2078{
2079	struct mpr_softc *sc;
2080	struct mpr_chain *chain, *chain1;
2081	struct mpr_command *cm;
2082	struct mpr_dumpreq_hdr hdr;
2083	struct sbuf *sb;
2084	uint32_t smid, state;
2085	int i, numreqs, error = 0;
2086
2087	sc = (struct mpr_softc *)arg1;
2088
2089	if ((error = priv_check(curthread, PRIV_DRIVER)) != 0) {
2090		printf("priv check error %d\n", error);
2091		return (error);
2092	}
2093
2094	state = MPR_CM_STATE_INQUEUE;
2095	smid = 1;
2096	numreqs = sc->num_reqs;
2097
2098	if (req->newptr != NULL)
2099		return (EINVAL);
2100
2101	if (smid == 0 || smid > sc->num_reqs)
2102		return (EINVAL);
2103	if (numreqs <= 0 || (numreqs + smid > sc->num_reqs))
2104		numreqs = sc->num_reqs;
2105	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
2106
2107	/* Best effort, no locking */
2108	for (i = smid; i < numreqs; i++) {
2109		cm = &sc->commands[i];
2110		if ((sc->dump_reqs_alltypes == 0) && (cm->cm_state != state))
2111			continue;
2112		hdr.smid = i;
2113		hdr.state = cm->cm_state;
2114		hdr.numframes = 1;
2115		hdr.deschi = cm->cm_desc.Words.High;
2116		hdr.desclo = cm->cm_desc.Words.Low;
2117		TAILQ_FOREACH_SAFE(chain, &cm->cm_chain_list, chain_link,
2118		   chain1)
2119			hdr.numframes++;
2120		sbuf_bcat(sb, &hdr, sizeof(hdr));
2121		sbuf_bcat(sb, cm->cm_req, 128);
2122		TAILQ_FOREACH_SAFE(chain, &cm->cm_chain_list, chain_link,
2123		    chain1)
2124			sbuf_bcat(sb, chain->chain, 128);
2125	}
2126
2127	error = sbuf_finish(sb);
2128	sbuf_delete(sb);
2129	return (error);
2130}
2131
2132int
2133mpr_attach(struct mpr_softc *sc)
2134{
2135	int error;
2136
2137	MPR_FUNCTRACE(sc);
2138	mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
2139
2140	mtx_init(&sc->mpr_mtx, "MPR lock", NULL, MTX_DEF);
2141	callout_init_mtx(&sc->periodic, &sc->mpr_mtx, 0);
2142	callout_init_mtx(&sc->device_check_callout, &sc->mpr_mtx, 0);
2143	TAILQ_INIT(&sc->event_list);
2144	timevalclear(&sc->lastfail);
2145
2146	if ((error = mpr_transition_ready(sc)) != 0) {
2147		mpr_dprint(sc, MPR_INIT|MPR_FAULT,
2148		    "Failed to transition ready\n");
2149		return (error);
2150	}
2151
2152	sc->facts = malloc(sizeof(MPI2_IOC_FACTS_REPLY), M_MPR,
2153	    M_ZERO|M_NOWAIT);
2154	if (!sc->facts) {
2155		mpr_dprint(sc, MPR_INIT|MPR_FAULT,
2156		    "Cannot allocate memory, exit\n");
2157		return (ENOMEM);
2158	}
2159
2160	/*
2161	 * Get IOC Facts and allocate all structures based on this information.
2162	 * A Diag Reset will also call mpr_iocfacts_allocate and re-read the IOC
2163	 * Facts. If relevant values have changed in IOC Facts, this function
2164	 * will free all of the memory based on IOC Facts and reallocate that
2165	 * memory.  If this fails, any allocated memory should already be freed.
2166	 */
2167	if ((error = mpr_iocfacts_allocate(sc, TRUE)) != 0) {
2168		mpr_dprint(sc, MPR_INIT|MPR_FAULT, "IOC Facts allocation "
2169		    "failed with error %d\n", error);
2170		return (error);
2171	}
2172
2173	/* Start the periodic watchdog check on the IOC Doorbell */
2174	mpr_periodic(sc);
2175
2176	/*
2177	 * The portenable will kick off discovery events that will drive the
2178	 * rest of the initialization process.  The CAM/SAS module will
2179	 * hold up the boot sequence until discovery is complete.
2180	 */
2181	sc->mpr_ich.ich_func = mpr_startup;
2182	sc->mpr_ich.ich_arg = sc;
2183	if (config_intrhook_establish(&sc->mpr_ich) != 0) {
2184		mpr_dprint(sc, MPR_INIT|MPR_ERROR,
2185		    "Cannot establish MPR config hook\n");
2186		error = EINVAL;
2187	}
2188
2189	/*
2190	 * Allow IR to shutdown gracefully when shutdown occurs.
2191	 */
2192	sc->shutdown_eh = EVENTHANDLER_REGISTER(shutdown_final,
2193	    mprsas_ir_shutdown, sc, SHUTDOWN_PRI_DEFAULT);
2194
2195	if (sc->shutdown_eh == NULL)
2196		mpr_dprint(sc, MPR_INIT|MPR_ERROR,
2197		    "shutdown event registration failed\n");
2198
2199	mpr_setup_sysctl(sc);
2200
2201	sc->mpr_flags |= MPR_FLAGS_ATTACH_DONE;
2202	mpr_dprint(sc, MPR_INIT, "%s exit error= %d\n", __func__, error);
2203
2204	return (error);
2205}
2206
2207/* Run through any late-start handlers. */
2208static void
2209mpr_startup(void *arg)
2210{
2211	struct mpr_softc *sc;
2212
2213	sc = (struct mpr_softc *)arg;
2214	mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
2215
2216	mpr_lock(sc);
2217	mpr_unmask_intr(sc);
2218
2219	/* initialize device mapping tables */
2220	mpr_base_static_config_pages(sc);
2221	mpr_mapping_initialize(sc);
2222	mprsas_startup(sc);
2223	mpr_unlock(sc);
2224
2225	mpr_dprint(sc, MPR_INIT, "disestablish config intrhook\n");
2226	config_intrhook_disestablish(&sc->mpr_ich);
2227	sc->mpr_ich.ich_arg = NULL;
2228
2229	mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
2230}
2231
2232/* Periodic watchdog.  Is called with the driver lock already held. */
2233static void
2234mpr_periodic(void *arg)
2235{
2236	struct mpr_softc *sc;
2237	uint32_t db;
2238
2239	sc = (struct mpr_softc *)arg;
2240	if (sc->mpr_flags & MPR_FLAGS_SHUTDOWN)
2241		return;
2242
2243	db = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
2244	if ((db & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
2245		if ((db & MPI2_DOORBELL_FAULT_CODE_MASK) ==
2246		    IFAULT_IOP_OVER_TEMP_THRESHOLD_EXCEEDED) {
2247			panic("TEMPERATURE FAULT: STOPPING.");
2248		}
2249		mpr_dprint(sc, MPR_FAULT, "IOC Fault 0x%08x, Resetting\n", db);
2250		mpr_reinit(sc);
2251	}
2252
2253	callout_reset_sbt(&sc->periodic, MPR_PERIODIC_DELAY * SBT_1S, 0,
2254	    mpr_periodic, sc, C_PREL(1));
2255}
2256
2257static void
2258mpr_log_evt_handler(struct mpr_softc *sc, uintptr_t data,
2259    MPI2_EVENT_NOTIFICATION_REPLY *event)
2260{
2261	MPI2_EVENT_DATA_LOG_ENTRY_ADDED *entry;
2262
2263	MPR_DPRINT_EVENT(sc, generic, event);
2264
2265	switch (event->Event) {
2266	case MPI2_EVENT_LOG_DATA:
2267		mpr_dprint(sc, MPR_EVENT, "MPI2_EVENT_LOG_DATA:\n");
2268		if (sc->mpr_debug & MPR_EVENT)
2269			hexdump(event->EventData, event->EventDataLength, NULL,
2270			    0);
2271		break;
2272	case MPI2_EVENT_LOG_ENTRY_ADDED:
2273		entry = (MPI2_EVENT_DATA_LOG_ENTRY_ADDED *)event->EventData;
2274		mpr_dprint(sc, MPR_EVENT, "MPI2_EVENT_LOG_ENTRY_ADDED event "
2275		    "0x%x Sequence %d:\n", entry->LogEntryQualifier,
2276		     entry->LogSequence);
2277		break;
2278	default:
2279		break;
2280	}
2281	return;
2282}
2283
2284static int
2285mpr_attach_log(struct mpr_softc *sc)
2286{
2287	uint8_t events[16];
2288
2289	bzero(events, 16);
2290	setbit(events, MPI2_EVENT_LOG_DATA);
2291	setbit(events, MPI2_EVENT_LOG_ENTRY_ADDED);
2292
2293	mpr_register_events(sc, events, mpr_log_evt_handler, NULL,
2294	    &sc->mpr_log_eh);
2295
2296	return (0);
2297}
2298
2299static int
2300mpr_detach_log(struct mpr_softc *sc)
2301{
2302
2303	if (sc->mpr_log_eh != NULL)
2304		mpr_deregister_events(sc, sc->mpr_log_eh);
2305	return (0);
2306}
2307
2308/*
2309 * Free all of the driver resources and detach submodules.  Should be called
2310 * without the lock held.
2311 */
2312int
2313mpr_free(struct mpr_softc *sc)
2314{
2315	int error;
2316
2317	mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
2318	/* Turn off the watchdog */
2319	mpr_lock(sc);
2320	sc->mpr_flags |= MPR_FLAGS_SHUTDOWN;
2321	mpr_unlock(sc);
2322	/* Lock must not be held for this */
2323	callout_drain(&sc->periodic);
2324	callout_drain(&sc->device_check_callout);
2325
2326	if (((error = mpr_detach_log(sc)) != 0) ||
2327	    ((error = mpr_detach_sas(sc)) != 0)) {
2328		mpr_dprint(sc, MPR_INIT|MPR_FAULT, "failed to detach "
2329		    "subsystems, error= %d, exit\n", error);
2330		return (error);
2331	}
2332
2333	mpr_detach_user(sc);
2334
2335	/* Put the IOC back in the READY state. */
2336	mpr_lock(sc);
2337	if ((error = mpr_transition_ready(sc)) != 0) {
2338		mpr_unlock(sc);
2339		return (error);
2340	}
2341	mpr_unlock(sc);
2342
2343	if (sc->facts != NULL)
2344		free(sc->facts, M_MPR);
2345
2346	/*
2347	 * Free all buffers that are based on IOC Facts.  A Diag Reset may need
2348	 * to free these buffers too.
2349	 */
2350	mpr_iocfacts_free(sc);
2351
2352	if (sc->sysctl_tree != NULL)
2353		sysctl_ctx_free(&sc->sysctl_ctx);
2354
2355	/* Deregister the shutdown function */
2356	if (sc->shutdown_eh != NULL)
2357		EVENTHANDLER_DEREGISTER(shutdown_final, sc->shutdown_eh);
2358
2359	mtx_destroy(&sc->mpr_mtx);
2360	mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
2361
2362	return (0);
2363}
2364
2365static __inline void
2366mpr_complete_command(struct mpr_softc *sc, struct mpr_command *cm)
2367{
2368	MPR_FUNCTRACE(sc);
2369
2370	if (cm == NULL) {
2371		mpr_dprint(sc, MPR_ERROR, "Completing NULL command\n");
2372		return;
2373	}
2374
2375	KASSERT(cm->cm_state == MPR_CM_STATE_INQUEUE,
2376	    ("command not inqueue, state = %u\n", cm->cm_state));
2377	cm->cm_state = MPR_CM_STATE_BUSY;
2378	if (cm->cm_flags & MPR_CM_FLAGS_POLLED)
2379		cm->cm_flags |= MPR_CM_FLAGS_COMPLETE;
2380
2381	if (cm->cm_complete != NULL) {
2382		mpr_dprint(sc, MPR_TRACE,
2383		    "%s cm %p calling cm_complete %p data %p reply %p\n",
2384		    __func__, cm, cm->cm_complete, cm->cm_complete_data,
2385		    cm->cm_reply);
2386		cm->cm_complete(sc, cm);
2387	}
2388
2389	if (cm->cm_flags & MPR_CM_FLAGS_WAKEUP) {
2390		mpr_dprint(sc, MPR_TRACE, "waking up %p\n", cm);
2391		wakeup(cm);
2392	}
2393
2394	if (sc->io_cmds_active != 0) {
2395		sc->io_cmds_active--;
2396	} else {
2397		mpr_dprint(sc, MPR_ERROR, "Warning: io_cmds_active is "
2398		    "out of sync - resynching to 0\n");
2399	}
2400}
2401
2402static void
2403mpr_sas_log_info(struct mpr_softc *sc , u32 log_info)
2404{
2405	union loginfo_type {
2406		u32	loginfo;
2407		struct {
2408			u32	subcode:16;
2409			u32	code:8;
2410			u32	originator:4;
2411			u32	bus_type:4;
2412		} dw;
2413	};
2414	union loginfo_type sas_loginfo;
2415	char *originator_str = NULL;
2416
2417	sas_loginfo.loginfo = log_info;
2418	if (sas_loginfo.dw.bus_type != 3 /*SAS*/)
2419		return;
2420
2421	/* each nexus loss loginfo */
2422	if (log_info == 0x31170000)
2423		return;
2424
2425	/* eat the loginfos associated with task aborts */
2426	if ((log_info == 0x30050000) || (log_info == 0x31140000) ||
2427	    (log_info == 0x31130000))
2428		return;
2429
2430	switch (sas_loginfo.dw.originator) {
2431	case 0:
2432		originator_str = "IOP";
2433		break;
2434	case 1:
2435		originator_str = "PL";
2436		break;
2437	case 2:
2438		originator_str = "IR";
2439		break;
2440	}
2441
2442	mpr_dprint(sc, MPR_LOG, "log_info(0x%08x): originator(%s), "
2443	    "code(0x%02x), sub_code(0x%04x)\n", log_info, originator_str,
2444	    sas_loginfo.dw.code, sas_loginfo.dw.subcode);
2445}
2446
2447static void
2448mpr_display_reply_info(struct mpr_softc *sc, uint8_t *reply)
2449{
2450	MPI2DefaultReply_t *mpi_reply;
2451	u16 sc_status;
2452
2453	mpi_reply = (MPI2DefaultReply_t*)reply;
2454	sc_status = le16toh(mpi_reply->IOCStatus);
2455	if (sc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE)
2456		mpr_sas_log_info(sc, le32toh(mpi_reply->IOCLogInfo));
2457}
2458
2459void
2460mpr_intr(void *data)
2461{
2462	struct mpr_softc *sc;
2463	uint32_t status;
2464
2465	sc = (struct mpr_softc *)data;
2466	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
2467
2468	/*
2469	 * Check interrupt status register to flush the bus.  This is
2470	 * needed for both INTx interrupts and driver-driven polling
2471	 */
2472	status = mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
2473	if ((status & MPI2_HIS_REPLY_DESCRIPTOR_INTERRUPT) == 0)
2474		return;
2475
2476	mpr_lock(sc);
2477	mpr_intr_locked(data);
2478	mpr_unlock(sc);
2479	return;
2480}
2481
2482/*
2483 * In theory, MSI/MSIX interrupts shouldn't need to read any registers on the
2484 * chip.  Hopefully this theory is correct.
2485 */
2486void
2487mpr_intr_msi(void *data)
2488{
2489	struct mpr_softc *sc;
2490
2491	sc = (struct mpr_softc *)data;
2492	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
2493	mpr_lock(sc);
2494	mpr_intr_locked(data);
2495	mpr_unlock(sc);
2496	return;
2497}
2498
2499/*
2500 * The locking is overly broad and simplistic, but easy to deal with for now.
2501 */
2502void
2503mpr_intr_locked(void *data)
2504{
2505	MPI2_REPLY_DESCRIPTORS_UNION *desc;
2506	MPI2_DIAG_RELEASE_REPLY *rel_rep;
2507	mpr_fw_diagnostic_buffer_t *pBuffer;
2508	struct mpr_softc *sc;
2509	uint64_t tdesc;
2510	struct mpr_command *cm = NULL;
2511	uint8_t flags;
2512	u_int pq;
2513
2514	sc = (struct mpr_softc *)data;
2515
2516	pq = sc->replypostindex;
2517	mpr_dprint(sc, MPR_TRACE,
2518	    "%s sc %p starting with replypostindex %u\n",
2519	    __func__, sc, sc->replypostindex);
2520
2521	for ( ;; ) {
2522		cm = NULL;
2523		desc = &sc->post_queue[sc->replypostindex];
2524
2525		/*
2526		 * Copy and clear out the descriptor so that any reentry will
2527		 * immediately know that this descriptor has already been
2528		 * looked at.  There is unfortunate casting magic because the
2529		 * MPI API doesn't have a cardinal 64bit type.
2530		 */
2531		tdesc = 0xffffffffffffffff;
2532		tdesc = atomic_swap_64((uint64_t *)desc, tdesc);
2533		desc = (MPI2_REPLY_DESCRIPTORS_UNION *)&tdesc;
2534
2535		flags = desc->Default.ReplyFlags &
2536		    MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK;
2537		if ((flags == MPI2_RPY_DESCRIPT_FLAGS_UNUSED) ||
2538		    (le32toh(desc->Words.High) == 0xffffffff))
2539			break;
2540
2541		/* increment the replypostindex now, so that event handlers
2542		 * and cm completion handlers which decide to do a diag
2543		 * reset can zero it without it getting incremented again
2544		 * afterwards, and we break out of this loop on the next
2545		 * iteration since the reply post queue has been cleared to
2546		 * 0xFF and all descriptors look unused (which they are).
2547		 */
2548		if (++sc->replypostindex >= sc->pqdepth)
2549			sc->replypostindex = 0;
2550
2551		switch (flags) {
2552		case MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS:
2553		case MPI25_RPY_DESCRIPT_FLAGS_FAST_PATH_SCSI_IO_SUCCESS:
2554		case MPI26_RPY_DESCRIPT_FLAGS_PCIE_ENCAPSULATED_SUCCESS:
2555			cm = &sc->commands[le16toh(desc->SCSIIOSuccess.SMID)];
2556			cm->cm_reply = NULL;
2557			break;
2558		case MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY:
2559		{
2560			uint32_t baddr;
2561			uint8_t *reply;
2562
2563			/*
2564			 * Re-compose the reply address from the address
2565			 * sent back from the chip.  The ReplyFrameAddress
2566			 * is the lower 32 bits of the physical address of
2567			 * particular reply frame.  Convert that address to
2568			 * host format, and then use that to provide the
2569			 * offset against the virtual address base
2570			 * (sc->reply_frames).
2571			 */
2572			baddr = le32toh(desc->AddressReply.ReplyFrameAddress);
2573			reply = sc->reply_frames +
2574				(baddr - ((uint32_t)sc->reply_busaddr));
2575			/*
2576			 * Make sure the reply we got back is in a valid
2577			 * range.  If not, go ahead and panic here, since
2578			 * we'll probably panic as soon as we dereference the
2579			 * reply pointer anyway.
2580			 */
2581			if ((reply < sc->reply_frames)
2582			 || (reply > (sc->reply_frames +
2583			     (sc->fqdepth * sc->replyframesz)))) {
2584				printf("%s: WARNING: reply %p out of range!\n",
2585				       __func__, reply);
2586				printf("%s: reply_frames %p, fqdepth %d, "
2587				       "frame size %d\n", __func__,
2588				       sc->reply_frames, sc->fqdepth,
2589				       sc->replyframesz);
2590				printf("%s: baddr %#x,\n", __func__, baddr);
2591				/* LSI-TODO. See Linux Code for Graceful exit */
2592				panic("Reply address out of range");
2593			}
2594			if (le16toh(desc->AddressReply.SMID) == 0) {
2595				if (((MPI2_DEFAULT_REPLY *)reply)->Function ==
2596				    MPI2_FUNCTION_DIAG_BUFFER_POST) {
2597					/*
2598					 * If SMID is 0 for Diag Buffer Post,
2599					 * this implies that the reply is due to
2600					 * a release function with a status that
2601					 * the buffer has been released.  Set
2602					 * the buffer flags accordingly.
2603					 */
2604					rel_rep =
2605					    (MPI2_DIAG_RELEASE_REPLY *)reply;
2606					if ((le16toh(rel_rep->IOCStatus) &
2607					    MPI2_IOCSTATUS_MASK) ==
2608					    MPI2_IOCSTATUS_DIAGNOSTIC_RELEASED)
2609					{
2610						pBuffer =
2611						    &sc->fw_diag_buffer_list[
2612						    rel_rep->BufferType];
2613						pBuffer->valid_data = TRUE;
2614						pBuffer->owned_by_firmware =
2615						    FALSE;
2616						pBuffer->immediate = FALSE;
2617					}
2618				} else
2619					mpr_dispatch_event(sc, baddr,
2620					    (MPI2_EVENT_NOTIFICATION_REPLY *)
2621					    reply);
2622			} else {
2623				cm = &sc->commands[
2624				    le16toh(desc->AddressReply.SMID)];
2625				if (cm->cm_state == MPR_CM_STATE_INQUEUE) {
2626					cm->cm_reply = reply;
2627					cm->cm_reply_data =
2628					    le32toh(desc->AddressReply.
2629						ReplyFrameAddress);
2630				} else {
2631					mpr_dprint(sc, MPR_RECOVERY,
2632					    "Bad state for ADDRESS_REPLY status,"
2633					    " ignoring state %d cm %p\n",
2634					    cm->cm_state, cm);
2635				}
2636			}
2637			break;
2638		}
2639		case MPI2_RPY_DESCRIPT_FLAGS_TARGETASSIST_SUCCESS:
2640		case MPI2_RPY_DESCRIPT_FLAGS_TARGET_COMMAND_BUFFER:
2641		case MPI2_RPY_DESCRIPT_FLAGS_RAID_ACCELERATOR_SUCCESS:
2642		default:
2643			/* Unhandled */
2644			mpr_dprint(sc, MPR_ERROR, "Unhandled reply 0x%x\n",
2645			    desc->Default.ReplyFlags);
2646			cm = NULL;
2647			break;
2648		}
2649
2650		if (cm != NULL) {
2651			// Print Error reply frame
2652			if (cm->cm_reply)
2653				mpr_display_reply_info(sc,cm->cm_reply);
2654			mpr_complete_command(sc, cm);
2655		}
2656	}
2657
2658	if (pq != sc->replypostindex) {
2659		mpr_dprint(sc, MPR_TRACE, "%s sc %p writing postindex %d\n",
2660		    __func__, sc, sc->replypostindex);
2661		mpr_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET,
2662		    sc->replypostindex);
2663	}
2664
2665	return;
2666}
2667
2668static void
2669mpr_dispatch_event(struct mpr_softc *sc, uintptr_t data,
2670    MPI2_EVENT_NOTIFICATION_REPLY *reply)
2671{
2672	struct mpr_event_handle *eh;
2673	int event, handled = 0;
2674
2675	event = le16toh(reply->Event);
2676	TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
2677		if (isset(eh->mask, event)) {
2678			eh->callback(sc, data, reply);
2679			handled++;
2680		}
2681	}
2682
2683	if (handled == 0)
2684		mpr_dprint(sc, MPR_EVENT, "Unhandled event 0x%x\n",
2685		    event);
2686
2687	/*
2688	 * This is the only place that the event/reply should be freed.
2689	 * Anything wanting to hold onto the event data should have
2690	 * already copied it into their own storage.
2691	 */
2692	mpr_free_reply(sc, data);
2693}
2694
2695static void
2696mpr_reregister_events_complete(struct mpr_softc *sc, struct mpr_command *cm)
2697{
2698	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
2699
2700	if (cm->cm_reply)
2701		MPR_DPRINT_EVENT(sc, generic,
2702			(MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply);
2703
2704	mpr_free_command(sc, cm);
2705
2706	/* next, send a port enable */
2707	mprsas_startup(sc);
2708}
2709
2710/*
2711 * For both register_events and update_events, the caller supplies a bitmap
2712 * of events that it _wants_.  These functions then turn that into a bitmask
2713 * suitable for the controller.
2714 */
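/*
 * A minimal caller looks like mpr_attach_log() above: zero a 16-byte event
 * mask, setbit() the MPI2_EVENT_* values of interest, and hand the mask and a
 * callback to mpr_register_events().  Passing a NULL mask registers the
 * callback without updating the controller; mpr_update_events() can be called
 * later with the desired mask.
 */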
2715int
2716mpr_register_events(struct mpr_softc *sc, uint8_t *mask,
2717    mpr_evt_callback_t *cb, void *data, struct mpr_event_handle **handle)
2718{
2719	struct mpr_event_handle *eh;
2720	int error = 0;
2721
2722	eh = malloc(sizeof(struct mpr_event_handle), M_MPR, M_WAITOK|M_ZERO);
2723	eh->callback = cb;
2724	eh->data = data;
2725	TAILQ_INSERT_TAIL(&sc->event_list, eh, eh_list);
2726	if (mask != NULL)
2727		error = mpr_update_events(sc, eh, mask);
2728	*handle = eh;
2729
2730	return (error);
2731}
2732
2733int
2734mpr_update_events(struct mpr_softc *sc, struct mpr_event_handle *handle,
2735    uint8_t *mask)
2736{
2737	MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
2738	MPI2_EVENT_NOTIFICATION_REPLY *reply = NULL;
2739	struct mpr_command *cm = NULL;
2740	struct mpr_event_handle *eh;
2741	int error, i;
2742
2743	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
2744
2745	if ((mask != NULL) && (handle != NULL))
2746		bcopy(mask, &handle->mask[0], 16);
2747	memset(sc->event_mask, 0xff, 16);
2748
2749	TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
2750		for (i = 0; i < 16; i++)
2751			sc->event_mask[i] &= ~eh->mask[i];
2752	}
2753
2754	if ((cm = mpr_alloc_command(sc)) == NULL)
2755		return (EBUSY);
2756	evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
2757	evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
2758	evtreq->MsgFlags = 0;
2759	evtreq->SASBroadcastPrimitiveMasks = 0;
2760#ifdef MPR_DEBUG_ALL_EVENTS
2761	{
2762		u_char fullmask[sizeof(evtreq->EventMasks)];
2763		memset(fullmask, 0x00, sizeof(fullmask));
2764		bcopy(fullmask, (uint8_t *)&evtreq->EventMasks, sizeof(fullmask));
2765	}
2766#else
2767	bcopy(sc->event_mask, (uint8_t *)&evtreq->EventMasks, sizeof(sc->event_mask));
2768	for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
2769		evtreq->EventMasks[i] = htole32(evtreq->EventMasks[i]);
2770#endif
2771	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
2772	cm->cm_data = NULL;
2773
2774	error = mpr_request_polled(sc, &cm);
2775	if (cm != NULL)
2776		reply = (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply;
2777	if ((reply == NULL) ||
2778	    (reply->IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
2779		error = ENXIO;
2780
2781	if (reply)
2782		MPR_DPRINT_EVENT(sc, generic, reply);
2783
2784	mpr_dprint(sc, MPR_TRACE, "%s finished error %d\n", __func__, error);
2785
2786	if (cm != NULL)
2787		mpr_free_command(sc, cm);
2788	return (error);
2789}
2790
2791static int
2792mpr_reregister_events(struct mpr_softc *sc)
2793{
2794	MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
2795	struct mpr_command *cm;
2796	struct mpr_event_handle *eh;
2797	int error, i;
2798
2799	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
2800
2801	/* first, reregister events */
2802
2803	memset(sc->event_mask, 0xff, 16);
2804
2805	TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
2806		for (i = 0; i < 16; i++)
2807			sc->event_mask[i] &= ~eh->mask[i];
2808	}
2809
2810	if ((cm = mpr_alloc_command(sc)) == NULL)
2811		return (EBUSY);
2812	evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
2813	evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
2814	evtreq->MsgFlags = 0;
2815	evtreq->SASBroadcastPrimitiveMasks = 0;
2816#ifdef MPR_DEBUG_ALL_EVENTS
2817	{
2818		u_char fullmask[sizeof(evtreq->EventMasks)];
2819		memset(fullmask, 0x00, sizeof(fullmask));
2820		bcopy(fullmask, (uint8_t *)&evtreq->EventMasks, sizeof(fullmask));
2821	}
2822#else
2823	bcopy(sc->event_mask, (uint8_t *)&evtreq->EventMasks, sizeof(sc->event_mask));
2824	for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
2825		evtreq->EventMasks[i] = htole32(evtreq->EventMasks[i]);
2826#endif
2827	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
2828	cm->cm_data = NULL;
2829	cm->cm_complete = mpr_reregister_events_complete;
2830
2831	error = mpr_map_command(sc, cm);
2832
2833	mpr_dprint(sc, MPR_TRACE, "%s finished with error %d\n", __func__,
2834	    error);
2835	return (error);
2836}
2837
2838int
2839mpr_deregister_events(struct mpr_softc *sc, struct mpr_event_handle *handle)
2840{
2841
2842	TAILQ_REMOVE(&sc->event_list, handle, eh_list);
2843	free(handle, M_MPR);
2844	return (mpr_update_events(sc, NULL, NULL));
2845}
2846
2847/**
2848* mpr_build_nvme_prp - This function is called for NVMe end devices to build a
2849* native SGL (NVMe PRP). The native SGL is built starting in the first PRP entry
2850* of the NVMe message (PRP1). If the data buffer is small enough to be described
2851* entirely using PRP1, then PRP2 is not used. If needed, PRP2 is used to
2852* describe a larger data buffer. If the data buffer is too large to describe
2853* using the two PRP entries inside the NVMe message, then PRP1 describes the
2854* first data memory segment, and PRP2 contains a pointer to a PRP list located
2855* elsewhere in memory to describe the remaining data memory segments. The PRP
2856* list will be contiguous.
2857*
2858* The native SGL for NVMe devices is a Physical Region Page (PRP). A PRP
2859* consists of a list of PRP entries to describe a number of noncontiguous
2860* physical memory segments as a single memory buffer, just as a SGL does. Note
2861* however, that this function is only used by the IOCTL call, so the memory
2862* given will be guaranteed to be contiguous. There is no need to translate
2863* non-contiguous SGL into a PRP in this case. All PRPs will describe contiguous
2864* space that is one page size each.
2865*
2866* Each NVMe message contains two PRP entries. The first (PRP1) either contains
2867* a PRP list pointer or a PRP element, depending upon the command. PRP2 contains
2868* the second PRP element if the memory being described fits within 2 PRP
2869* entries, or a PRP list pointer if the PRP spans more than two entries.
2870*
2871* A PRP list pointer contains the address of a PRP list, structured as a linear
2872* array of PRP entries. Each PRP entry in this list describes a segment of
2873* physical memory.
2874*
2875* Each 64-bit PRP entry comprises an address and an offset field. The address
2876* always points to the beginning of a PAGE_SIZE physical memory page, and the
2877* offset describes where within that page the memory segment begins. Only the
2878* first element in a PRP list may contain a non-zero offset, implying that all
2879* memory segments following the first begin at the start of a PAGE_SIZE page.
2880*
2881* Each PRP element normally describes a chunk of PAGE_SIZE physical memory,
2882* with exceptions for the first and last elements in the list. If the memory
2883* being described by the list begins at a non-zero offset within the first page,
2884* then the first PRP element will contain a non-zero offset indicating where the
2885* region begins within the page. The last memory segment may end before the end
2886* of the PAGE_SIZE segment, depending upon the overall size of the memory being
2887* described by the PRP list.
2888*
2889* Since PRP entries lack any indication of size, the overall data buffer length
2890* is used to determine where the end of the data memory buffer is located, and
2891* how many PRP entries are required to describe it.
2892*
2893* Returns nothing.
2894*/
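/*
 * A small worked example of the intended layout, assuming 4KB pages: for a
 * 12KB buffer starting 0x200 bytes into a page, PRP1 covers the first 0xe00
 * bytes, PRP2 points at the PRP list in the contiguous page allocated by
 * mpr_alloc_nvme_prp_pages(), and that list holds three page-aligned entries
 * covering the remaining 0x2200 bytes (the last entry only partially used).
 */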
2895void
2896mpr_build_nvme_prp(struct mpr_softc *sc, struct mpr_command *cm,
2897    Mpi26NVMeEncapsulatedRequest_t *nvme_encap_request, void *data,
2898    uint32_t data_in_sz, uint32_t data_out_sz)
2899{
2900	int			prp_size = PRP_ENTRY_SIZE;
2901	uint64_t		*prp_entry, *prp1_entry, *prp2_entry;
2902	uint64_t		*prp_entry_phys, *prp_page, *prp_page_phys;
2903	uint32_t		offset, entry_len, page_mask_result, page_mask;
2904	bus_addr_t		paddr;
2905	size_t			length;
2906	struct mpr_prp_page	*prp_page_info = NULL;
2907
2908	/*
2909	 * Not all commands require a data transfer. If no data, just return
2910	 * without constructing any PRP.
2911	 */
2912	if (!data_in_sz && !data_out_sz)
2913		return;
2914
2915	/*
2916	 * Set pointers to PRP1 and PRP2, which are in the NVMe command. PRP1 is
2917	 * located at a 24 byte offset from the start of the NVMe command. Then
2918	 * set the current PRP entry pointer to PRP1.
2919	 */
2920	prp1_entry = (uint64_t *)(nvme_encap_request->NVMe_Command +
2921	    NVME_CMD_PRP1_OFFSET);
2922	prp2_entry = (uint64_t *)(nvme_encap_request->NVMe_Command +
2923	    NVME_CMD_PRP2_OFFSET);
2924	prp_entry = prp1_entry;
2925
2926	/*
2927	 * For the PRP entries, use the specially allocated buffer of
2928	 * contiguous memory. PRP Page allocation failures should not happen
2929	 * because there should be enough PRP page buffers to account for the
2930	 * possible NVMe QDepth.
2931	 */
2932	prp_page_info = mpr_alloc_prp_page(sc);
2933	KASSERT(prp_page_info != NULL, ("%s: There are no PRP Pages left to be "
2934	    "used for building a native NVMe SGL.\n", __func__));
2935	prp_page = (uint64_t *)prp_page_info->prp_page;
2936	prp_page_phys = (uint64_t *)(uintptr_t)prp_page_info->prp_page_busaddr;
2937
2938	/*
2939	 * Insert the allocated PRP page into the command's PRP page list. This
2940	 * will be freed when the command is freed.
2941	 */
2942	TAILQ_INSERT_TAIL(&cm->cm_prp_page_list, prp_page_info, prp_page_link);
2943
2944	/*
2945	 * Check if we are within 1 entry of a page boundary; we don't want our
2946	 * first entry to be a PRP List entry.
2947	 */
2948	page_mask = PAGE_SIZE - 1;
2949	page_mask_result = (uintptr_t)((uint8_t *)prp_page + prp_size) &
2950	    page_mask;
2951	if (!page_mask_result)
2952	{
2953		/* Bump up to next page boundary. */
2954		prp_page = (uint64_t *)((uint8_t *)prp_page + prp_size);
2955		prp_page_phys = (uint64_t *)((uint8_t *)prp_page_phys +
2956		    prp_size);
2957	}
2958
2959	/*
2960	 * Set PRP physical pointer, which initially points to the current PRP
2961	 * DMA memory page.
2962	 */
2963	prp_entry_phys = prp_page_phys;
2964
2965	/* Get physical address and length of the data buffer. */
2966	paddr = (bus_addr_t)(uintptr_t)data;
2967	if (data_in_sz)
2968		length = data_in_sz;
2969	else
2970		length = data_out_sz;
2971
2972	/* Loop while the length is not zero. */
2973	while (length)
2974	{
2975		/*
2976		 * Check if a list pointer is needed here, i.e. if we are within
2977		 * prp_size (8 bytes) of a page boundary.
2978		 */
2979		page_mask_result = (uintptr_t)((uint8_t *)prp_entry_phys +
2980		    prp_size) & page_mask;
2981		if (!page_mask_result)
2982		{
2983			/*
2984			 * This is the last entry in a PRP List, so we need to
2985			 * put a PRP list pointer here. What this does is:
2986			 *   - bump the current memory pointer to the next
2987			 *     address, which will be the next full page.
2988			 *   - set the PRP Entry to point to that page. This is
2989			 *     now the PRP List pointer.
2990			 *   - bump the PRP Entry pointer to the start of the next
2991			 *     page. Since all of this PRP memory is contiguous,
2992			 *     no need to get a new page - it's just the next
2993			 *     address.
2994			 */
2995			prp_entry_phys++;
2996			*prp_entry =
2997			    htole64((uint64_t)(uintptr_t)prp_entry_phys);
2998			prp_entry++;
2999		}
3000
3001		/* Need to handle if entry will be part of a page. */
3002		offset = (uint32_t)paddr & page_mask;
3003		entry_len = PAGE_SIZE - offset;
3004
3005		if (prp_entry == prp1_entry)
3006		{
3007			/*
3008			 * Must fill in the first PRP pointer (PRP1) before
3009			 * moving on.
3010			 */
3011			*prp1_entry = htole64((uint64_t)paddr);
3012
3013			/*
3014			 * Now point to the second PRP entry within the
3015			 * command (PRP2).
3016			 */
3017			prp_entry = prp2_entry;
3018		}
3019		else if (prp_entry == prp2_entry)
3020		{
3021			/*
3022			 * Should the PRP2 entry be a PRP List pointer or just a
3023			 * regular PRP pointer? If there is more than one more
3024			 * page of data, must use a PRP List pointer.
3025			 */
3026			if (length > PAGE_SIZE)
3027			{
3028				/*
3029				 * PRP2 will contain a PRP List pointer because
3030				 * more PRP's are needed with this command. The
3031				 * list will start at the beginning of the
3032				 * contiguous buffer.
3033				 */
3034				*prp2_entry =
3035				    htole64(
3036				    (uint64_t)(uintptr_t)prp_entry_phys);
3037
3038				/*
3039				 * The next PRP Entry will be the start of the
3040				 * first PRP List.
3041				 */
3042				prp_entry = prp_page;

				/*
				 * Nothing was consumed on this pass, so go
				 * around again and place this data address
				 * into the first list entry.
				 */
				continue;
3043			}
3044			else
3045			{
3046				/*
3047				 * After this, the PRP Entries are complete.
3048				 * This command uses 2 PRP's and no PRP list.
3049				 */
3050				*prp2_entry = htole64((uint64_t)paddr);
3051			}
3052		}
3053		else
3054		{
3055			/*
3056			 * Put entry in list and bump the addresses.
3057			 *
3058			 * After PRP1 and PRP2 are filled in, this will fill in
3059			 * all remaining PRP entries in a PRP List, one per each
3060			 * time through the loop.
3061			 */
3062			*prp_entry = htole64((uint64_t)paddr);
3063			prp_entry++;
3064			prp_entry_phys++;
3065		}
3066
3067		/*
3068		 * Bump the phys address of the command's data buffer by the
3069		 * entry_len.
3070		 */
3071		paddr += entry_len;
3072
3073		/* Decrement length accounting for last partial page. */
3074		if (entry_len > length)
3075			length = 0;
3076		else
3077			length -= entry_len;
3078	}
3079}
3080
3081/*
3082 * mpr_check_pcie_native_sgl - This function is called for PCIe end devices to
3083 * determine if the driver needs to build a native SGL. If so, that native SGL
3084 * is built in the contiguous buffers allocated especially for PCIe SGL
3085 * creation. If the driver will not build a native SGL, return TRUE and a
3086 * normal IEEE SGL will be built. Currently this routine supports NVMe devices
3087 * only.
3088 *
3089 * Returns FALSE (0) if native SGL was built, TRUE (1) if no SGL was built.
3090 */
3091static int
3092mpr_check_pcie_native_sgl(struct mpr_softc *sc, struct mpr_command *cm,
3093    bus_dma_segment_t *segs, int segs_left)
3094{
3095	uint32_t		i, sge_dwords, length, offset, entry_len;
3096	uint32_t		num_entries, buff_len = 0, sges_in_segment;
3097	uint32_t		page_mask, page_mask_result, *curr_buff;
3098	uint32_t		*ptr_sgl, *ptr_first_sgl, first_page_offset;
3099	uint32_t		first_page_data_size, end_residual;
3100	uint64_t		*msg_phys;
3101	bus_addr_t		paddr;
3102	int			build_native_sgl = 0, first_prp_entry;
3103	int			prp_size = PRP_ENTRY_SIZE;
3104	Mpi25IeeeSgeChain64_t	*main_chain_element = NULL;
3105	struct mpr_prp_page	*prp_page_info = NULL;
3106
3107	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
3108
3109	/*
3110	 * Add up the sizes of each segment length to get the total transfer
3111	 * size, which will be checked against the Maximum Data Transfer Size.
3112	 * If the data transfer length exceeds the MDTS for this device, just
3113	 * return 1 so a normal IEEE SGL will be built. F/W will break the I/O
3114	 * up into multiple I/O's. [nvme_mdts = 0 means unlimited]
3115	 */
3116	for (i = 0; i < segs_left; i++)
3117		buff_len += htole32(segs[i].ds_len);
3118	if ((cm->cm_targ->MDTS > 0) && (buff_len > cm->cm_targ->MDTS))
3119		return 1;
3120
3121	/* Create page_mask (to get offset within page) */
3122	page_mask = PAGE_SIZE - 1;
3123
3124	/*
3125	 * Check if the number of elements exceeds the max number that can be
3126	 * put in the main message frame (H/W can only translate an SGL that
3127	 * is contained entirely in the main message frame).
3128	 */
3129	sges_in_segment = (sc->reqframesz -
3130	    offsetof(Mpi25SCSIIORequest_t, SGL)) / sizeof(MPI25_SGE_IO_UNION);
3131	if (segs_left > sges_in_segment)
3132		build_native_sgl = 1;
3133	else
3134	{
3135		/*
3136		 * NVMe uses one PRP for each physical page (or part of physical
3137		 * page).
3138		 *    if 4 pages or less then IEEE is OK
3139		 *    if > 5 pages then we need to build a native SGL
3140		 *    if > 4 and <= 5 pages, then check the physical address of
3141		 *      the first SG entry, then if this first size in the page
3142		 *      is >= the residual beyond 4 pages then use IEEE,
3143		 *      otherwise use native SGL
3144		 */
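		/*
		 * A numeric sketch, assuming 4KB pages: an 18KB buffer whose
		 * first segment starts 3KB into a page gives
		 * first_page_data_size = 1KB and end_residual = 2KB; since
		 * 1KB < 2KB the data spills past the fifth page, so a native
		 * SGL is built.
		 */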
3145		if (buff_len > (PAGE_SIZE * 5))
3146			build_native_sgl = 1;
3147		else if ((buff_len > (PAGE_SIZE * 4)) &&
3148		    (buff_len <= (PAGE_SIZE * 5)) )
3149		{
3150			msg_phys = (uint64_t *)(uintptr_t)segs[0].ds_addr;
3151			first_page_offset =
3152			    ((uint32_t)(uint64_t)(uintptr_t)msg_phys &
3153			    page_mask);
3154			first_page_data_size = PAGE_SIZE - first_page_offset;
3155			end_residual = buff_len % PAGE_SIZE;
3156
3157			/*
3158			 * If offset into first page pushes the end of the data
3159			 * beyond end of the 5th page, we need the extra PRP
3160			 * list.
3161			 */
3162			if (first_page_data_size < end_residual)
3163				build_native_sgl = 1;
3164
3165			/*
3166			 * Check if first SG entry size is < residual beyond 4
3167			 * pages.
3168			 */
3169			if (htole32(segs[0].ds_len) <
3170			    (buff_len - (PAGE_SIZE * 4)))
3171				build_native_sgl = 1;
3172		}
3173	}
3174
3175	/* check if native SGL is needed */
3176	if (!build_native_sgl)
3177		return 1;
3178
3179	/*
3180	 * Native SGL is needed.
3181	 * Put a chain element in main message frame that points to the first
3182	 * chain buffer.
3183	 *
3184	 * NOTE:  The ChainOffset field must be 0 when using a chain pointer to
3185	 *        a native SGL.
3186	 */
3187
3188	/* Set main message chain element pointer */
3189	main_chain_element = (pMpi25IeeeSgeChain64_t)cm->cm_sge;
3190
3191	/*
3192	 * For NVMe the chain element needs to be the 2nd SGL entry in the main
3193	 * message.
3194	 */
3195	main_chain_element = (Mpi25IeeeSgeChain64_t *)
3196	    ((uint8_t *)main_chain_element + sizeof(MPI25_IEEE_SGE_CHAIN64));
3197
3198	/*
3199	 * For the PRP entries, use the specially allocated buffer of
3200	 * contiguous memory. PRP Page allocation failures should not happen
3201	 * because there should be enough PRP page buffers to account for the
3202	 * possible NVMe QDepth.
3203	 */
3204	prp_page_info = mpr_alloc_prp_page(sc);
3205	KASSERT(prp_page_info != NULL, ("%s: There are no PRP Pages left to be "
3206	    "used for building a native NVMe SGL.\n", __func__));
3207	curr_buff = (uint32_t *)prp_page_info->prp_page;
3208	msg_phys = (uint64_t *)(uintptr_t)prp_page_info->prp_page_busaddr;
3209
3210	/*
3211	 * Insert the allocated PRP page into the command's PRP page list. This
3212	 * will be freed when the command is freed.
3213	 */
3214	TAILQ_INSERT_TAIL(&cm->cm_prp_page_list, prp_page_info, prp_page_link);
3215
3216	/*
3217	 * Check if we are within 1 entry of a page boundary; we don't want our
3218	 * first entry to be a PRP List entry.
3219	 */
3220	page_mask_result = (uintptr_t)((uint8_t *)curr_buff + prp_size) &
3221	    page_mask;
3222	if (!page_mask_result) {
3223		/* Bump up to next page boundary. */
3224		curr_buff = (uint32_t *)((uint8_t *)curr_buff + prp_size);
3225		msg_phys = (uint64_t *)((uint8_t *)msg_phys + prp_size);
3226	}
3227
3228	/* Fill in the chain element and make it an NVMe segment type. */
3229	main_chain_element->Address.High =
3230	    htole32((uint32_t)((uint64_t)(uintptr_t)msg_phys >> 32));
3231	main_chain_element->Address.Low =
3232	    htole32((uint32_t)(uintptr_t)msg_phys);
3233	main_chain_element->NextChainOffset = 0;
3234	main_chain_element->Flags = MPI2_IEEE_SGE_FLAGS_CHAIN_ELEMENT |
3235	    MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR |
3236	    MPI26_IEEE_SGE_FLAGS_NSF_NVME_PRP;
3237
3238	/* Set SGL pointer to start of contiguous PCIe buffer. */
3239	ptr_sgl = curr_buff;
3240	sge_dwords = 2;
3241	num_entries = 0;
3242
3243	/*
3244	 * NVMe has a very convoluted PRP format. One PRP is required for each
3245	 * page or partial page. We need to split up OS SG entries if they are
3246	 * longer than one page or cross a page boundary. We also have to insert
3247	 * a PRP list pointer entry as the last entry in each physical page of
3248	 * the PRP list.
3249	 *
3250	 * NOTE: The first PRP "entry" is actually placed in the first SGL entry
3251	 * in the main message in IEEE 64 format. The 2nd entry in the main
3252	 * message is the chain element, and the rest of the PRP entries are
3253	 * built in the contiguous PCIe buffer.
3254	 */
3255	first_prp_entry = 1;
3256	ptr_first_sgl = (uint32_t *)cm->cm_sge;
3257
3258	for (i = 0; i < segs_left; i++) {
3259		/* Get physical address and length of this SG entry. */
3260		paddr = segs[i].ds_addr;
3261		length = segs[i].ds_len;
3262
3263		/*
3264		 * Check whether an SGE other than the first one starts on a
3265		 * non-page-aligned boundary. That is not expected, so return 1
3266		 * and let a normal IEEE SGL be built instead.
3267		 */
3268		if ((i != 0) && (((uint32_t)paddr & page_mask) != 0)) {
3269			mpr_dprint(sc, MPR_ERROR, "Unaligned SGE while "
3270			    "building NVMe PRPs, low address is 0x%x\n",
3271			    (uint32_t)paddr);
3272			return 1;
3273		}
3274
3275		/* Apart from the last SGE, if any other SGE ends on a boundary
3276		 * that is not page aligned, a hole exists in the buffer. Such a
3277		 * hole would lead to data corruption, so fall back to IEEE SGEs.
3278		 */
3279		if (i != (segs_left - 1)) {
3280			if (((uint32_t)paddr + length) & page_mask) {
3281				mpr_dprint(sc, MPR_ERROR, "Unaligned SGE "
3282				    "boundary while building NVMe PRPs, low "
3283				    "address: 0x%x and length: %u\n",
3284				    (uint32_t)paddr, length);
3285				return 1;
3286			}
3287		}
3288
3289		/* Loop while the length is not zero. */
3290		while (length) {
3291			/*
3292			 * Check if a list pointer is needed here, i.e. if we
3293			 * are within prp_size of a page boundary.
3294			 */
3295			page_mask_result = (uintptr_t)((uint8_t *)ptr_sgl +
3296			    prp_size) & page_mask;
3297			if (!page_mask_result) {
3298				/*
3299				 * Need to put a PRP list pointer here.
3300				 */
3301				msg_phys = (uint64_t *)((uint8_t *)msg_phys +
3302				    prp_size);
3303				*ptr_sgl = htole32((uintptr_t)msg_phys);
3304				*(ptr_sgl+1) = htole32((uint64_t)(uintptr_t)
3305				    msg_phys >> 32);
3306				ptr_sgl += sge_dwords;
3307				num_entries++;
3308			}
3309
3310			/* Handle an entry that covers only part of a page. */
3311			offset = (uint32_t)paddr & page_mask;
3312			entry_len = PAGE_SIZE - offset;
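			/*
			 * Illustrative example (assuming 4 KiB pages): a
			 * segment starting at 0x10200 has offset 0x200, so
			 * entry_len = 0x1000 - 0x200 = 0xe00 and the next PRP
			 * entry starts at the 0x11000 page boundary.
			 */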
3313			if (first_prp_entry) {
3314				/*
3315				 * Put IEEE entry in first SGE in main message.
3316				 * (Simple element, System addr, not end of
3317				 * list.)
3318				 */
3319				*ptr_first_sgl = htole32((uint32_t)paddr);
3320				*(ptr_first_sgl + 1) =
3321				    htole32((uint32_t)((uint64_t)paddr >> 32));
3322				*(ptr_first_sgl + 2) = htole32(entry_len);
3323				*(ptr_first_sgl + 3) = 0;
3324
3325				/* No longer the first PRP entry. */
3326				first_prp_entry = 0;
3327			} else {
3328				/* Put entry in list. */
3329				*ptr_sgl = htole32((uint32_t)paddr);
3330				*(ptr_sgl + 1) =
3331				    htole32((uint32_t)((uint64_t)paddr >> 32));
3332
3333				/* Bump ptr_sgl, msg_phys, and num_entries. */
3334				ptr_sgl += sge_dwords;
3335				msg_phys = (uint64_t *)((uint8_t *)msg_phys +
3336				    prp_size);
3337				num_entries++;
3338			}
3339
3340			/* Bump the phys address by the entry_len. */
3341			paddr += entry_len;
3342
3343			/* Decrement length accounting for last partial page. */
3344			if (entry_len > length)
3345				length = 0;
3346			else
3347				length -= entry_len;
3348		}
3349	}
3350
3351	/* Set chain element Length. */
3352	main_chain_element->Length = htole32(num_entries * prp_size);
3353
3354	/* Return 0, indicating we built a native SGL. */
3355	return 0;
3356}
3357
3358/*
3359 * Add a chain element as the next SGE for the specified command.
3360 * Reset cm_sge and cm_sglsize to indicate all the available space. Chains are
3361 * only required for IEEE commands.  Therefore there is no code for commands
3362 * that have the MPR_CM_FLAGS_SGE_SIMPLE flag set (and those commands
3363 * shouldn't be requesting chains).
3364 */
3365static int
3366mpr_add_chain(struct mpr_command *cm, int segsleft)
3367{
3368	struct mpr_softc *sc = cm->cm_sc;
3369	MPI2_REQUEST_HEADER *req;
3370	MPI25_IEEE_SGE_CHAIN64 *ieee_sgc;
3371	struct mpr_chain *chain;
3372	int sgc_size, current_segs, rem_segs, segs_per_frame;
3373	uint8_t next_chain_offset = 0;
3374
3375	/*
3376	 * Fail if a command is requesting a chain for SIMPLE SGEs.  For SAS3
3377	 * only IEEE commands should be requesting chains.  Return some error
3378	 * code other than 0.
3379	 */
3380	if (cm->cm_flags & MPR_CM_FLAGS_SGE_SIMPLE) {
3381		mpr_dprint(sc, MPR_ERROR, "A chain element cannot be added to "
3382		    "an MPI SGL.\n");
3383		return(ENOBUFS);
3384	}
3385
3386	sgc_size = sizeof(MPI25_IEEE_SGE_CHAIN64);
3387	if (cm->cm_sglsize < sgc_size)
3388		panic("MPR: Need SGE Error Code\n");
3389
3390	chain = mpr_alloc_chain(cm->cm_sc);
3391	if (chain == NULL)
3392		return (ENOBUFS);
3393
3394	/*
3395	 * Note: a doubly-linked list is used to make it easier to walk for
3396	 * debugging.
3397	 */
3398	TAILQ_INSERT_TAIL(&cm->cm_chain_list, chain, chain_link);
3399
3400	/*
3401	 * Need to know whether the number of frames left is more than 1.  If
3402	 * more than 1 frame is required, NextChainOffset will need to be set,
3403	 * and it is simply the offset of the last segment of the frame.
3404	 */
3405	rem_segs = 0;
3406	if (cm->cm_sglsize < (sgc_size * segsleft)) {
3407		/*
3408		 * rem_segs is the number of segments remaining after the
3409		 * segments that will go into the current frame.  Since it is
3410		 * known that at least one more frame is required, account for
3411		 * the chain element.  To know if more than one more frame is
3412		 * required, just check if there will be a remainder after using
3413		 * the current frame (with this chain) and the next frame.  If
3414		 * so, the NextChainOffset must be the last element of the next
3415		 * frame.
3416		 */
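		/*
		 * Hypothetical example (numbers for illustration only): with
		 * sgc_size = 16, chain_frame_size = 128 and segsleft = 20, if
		 * only 64 bytes remain in the current frame then current_segs
		 * = 64 / 16 - 1 = 3, rem_segs = 20 - 3 = 17 and segs_per_frame
		 * = 128 / 16 = 8.  Since 17 > 8, next_chain_offset = 7, i.e.
		 * the last slot of the next frame holds yet another chain.
		 */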
3417		current_segs = (cm->cm_sglsize / sgc_size) - 1;
3418		rem_segs = segsleft - current_segs;
3419		segs_per_frame = sc->chain_frame_size / sgc_size;
3420		if (rem_segs > segs_per_frame) {
3421			next_chain_offset = segs_per_frame - 1;
3422		}
3423	}
3424	ieee_sgc = &((MPI25_SGE_IO_UNION *)cm->cm_sge)->IeeeChain;
3425	ieee_sgc->Length = next_chain_offset ?
3426	    htole32((uint32_t)sc->chain_frame_size) :
3427	    htole32((uint32_t)rem_segs * (uint32_t)sgc_size);
3428	ieee_sgc->NextChainOffset = next_chain_offset;
3429	ieee_sgc->Flags = (MPI2_IEEE_SGE_FLAGS_CHAIN_ELEMENT |
3430	    MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
3431	ieee_sgc->Address.Low = htole32(chain->chain_busaddr);
3432	ieee_sgc->Address.High = htole32(chain->chain_busaddr >> 32);
3433	cm->cm_sge = &((MPI25_SGE_IO_UNION *)chain->chain)->IeeeSimple;
3434	req = (MPI2_REQUEST_HEADER *)cm->cm_req;
3435	req->ChainOffset = (sc->chain_frame_size - sgc_size) >> 4;
3436
3437	cm->cm_sglsize = sc->chain_frame_size;
3438	return (0);
3439}
3440
3441/*
3442 * Add one scatter-gather element to the scatter-gather list for a command.
3443 * Maintain cm_sglsize and cm_sge as the remaining size and pointer to the
3444 * next SGE to fill in, respectively.  In Gen3, the MPI SGL does not have a
3445 * chain, so don't consider any chain additions.
3446 */
3447int
3448mpr_push_sge(struct mpr_command *cm, MPI2_SGE_SIMPLE64 *sge, size_t len,
3449    int segsleft)
3450{
3451	uint32_t saved_buf_len, saved_address_low, saved_address_high;
3452	u32 sge_flags;
3453
3454	/*
3455	 * case 1: >=1 more segment, no room for anything (error)
3456	 * case 2: 1 more segment and enough room for it
3457	 */
3458
3459	if (cm->cm_sglsize < (segsleft * sizeof(MPI2_SGE_SIMPLE64))) {
3460		mpr_dprint(cm->cm_sc, MPR_ERROR,
3461		    "%s: warning: Not enough room for MPI SGL in frame.\n",
3462		    __func__);
3463		return(ENOBUFS);
3464	}
3465
3466	KASSERT(segsleft == 1,
3467	    ("segsleft cannot be more than 1 for an MPI SGL; segsleft = %d\n",
3468	    segsleft));
3469
3470	/*
3471	 * There is one more segment left to add for the MPI SGL and there is
3472	 * enough room in the frame to add it.  This is the normal case because
3473	 * MPI SGLs don't have chains; otherwise something is wrong.
3474	 *
3475	 * If this is a bi-directional request, need to account for that
3476	 * here.  Save the pre-filled sge values.  These will be used
3477	 * either for the 2nd SGL or for a single direction SGL.  If
3478	 * cm_out_len is non-zero, this is a bi-directional request, so
3479	 * fill in the OUT SGL first, then the IN SGL, otherwise just
3480	 * fill in the IN SGL.  Note that at this time, when filling in
3481	 * 2 SGLs for a bi-directional request, they both use the same
3482	 * DMA buffer (same cm command).
3483	 */
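	/*
	 * Descriptive note: for a bi-directional request, the code below
	 * first copies an OUT SGL (HOST_TO_IOC, length cm_out_len) into the
	 * frame, then a final SGL carrying the END_OF_LIST flags and the
	 * direction implied by MPR_CM_FLAGS_DATAIN; both use the same DMA
	 * address.
	 */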
3484	saved_buf_len = sge->FlagsLength & 0x00FFFFFF;
3485	saved_address_low = sge->Address.Low;
3486	saved_address_high = sge->Address.High;
3487	if (cm->cm_out_len) {
3488		sge->FlagsLength = cm->cm_out_len |
3489		    ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
3490		    MPI2_SGE_FLAGS_END_OF_BUFFER |
3491		    MPI2_SGE_FLAGS_HOST_TO_IOC |
3492		    MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
3493		    MPI2_SGE_FLAGS_SHIFT);
3494		cm->cm_sglsize -= len;
3495		/* Endian Safe code */
3496		sge_flags = sge->FlagsLength;
3497		sge->FlagsLength = htole32(sge_flags);
3498		bcopy(sge, cm->cm_sge, len);
3499		cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
3500	}
3501	sge->FlagsLength = saved_buf_len |
3502	    ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
3503	    MPI2_SGE_FLAGS_END_OF_BUFFER |
3504	    MPI2_SGE_FLAGS_LAST_ELEMENT |
3505	    MPI2_SGE_FLAGS_END_OF_LIST |
3506	    MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
3507	    MPI2_SGE_FLAGS_SHIFT);
3508	if (cm->cm_flags & MPR_CM_FLAGS_DATAIN) {
3509		sge->FlagsLength |=
3510		    ((uint32_t)(MPI2_SGE_FLAGS_IOC_TO_HOST) <<
3511		    MPI2_SGE_FLAGS_SHIFT);
3512	} else {
3513		sge->FlagsLength |=
3514		    ((uint32_t)(MPI2_SGE_FLAGS_HOST_TO_IOC) <<
3515		    MPI2_SGE_FLAGS_SHIFT);
3516	}
3517	sge->Address.Low = saved_address_low;
3518	sge->Address.High = saved_address_high;
3519
3520	cm->cm_sglsize -= len;
3521	/* Endian Safe code */
3522	sge_flags = sge->FlagsLength;
3523	sge->FlagsLength = htole32(sge_flags);
3524	bcopy(sge, cm->cm_sge, len);
3525	cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
3526	return (0);
3527}
3528
3529/*
3530 * Add one IEEE scatter-gather element (chain or simple) to the IEEE scatter-
3531 * gather list for a command.  Maintain cm_sglsize and cm_sge as the
3532 * remaining size and pointer to the next SGE to fill in, respectively.
3533 */
3534int
3535mpr_push_ieee_sge(struct mpr_command *cm, void *sgep, int segsleft)
3536{
3537	MPI2_IEEE_SGE_SIMPLE64 *sge = sgep;
3538	int error, ieee_sge_size = sizeof(MPI25_SGE_IO_UNION);
3539	uint32_t saved_buf_len, saved_address_low, saved_address_high;
3540	uint32_t sge_length;
3541
3542	/*
3543	 * case 1: No room for chain or segment (error).
3544	 * case 2: Two or more segments left but only room for chain.
3545	 * case 3: Last segment and room for it, so set flags.
3546	 */
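	/*
	 * Descriptive note: case 1 is the panic below, case 2 is handled by
	 * calling mpr_add_chain(), and case 3 is the segsleft == 1 block
	 * that sets the END_OF_LIST flags.
	 */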
3547
3548	/*
3549	 * There should be room for at least one element, or there is a big
3550	 * problem.
3551	 */
3552	if (cm->cm_sglsize < ieee_sge_size)
3553		panic("MPR: Need SGE Error Code\n");
3554
3555	if ((segsleft >= 2) && (cm->cm_sglsize < (ieee_sge_size * 2))) {
3556		if ((error = mpr_add_chain(cm, segsleft)) != 0)
3557			return (error);
3558	}
3559
3560	if (segsleft == 1) {
3561		/*
3562		 * If this is a bi-directional request, need to account for that
3563		 * here.  Save the pre-filled sge values.  These will be used
3564		 * either for the 2nd SGL or for a single direction SGL.  If
3565		 * cm_out_len is non-zero, this is a bi-directional request, so
3566		 * fill in the OUT SGL first, then the IN SGL, otherwise just
3567		 * fill in the IN SGL.  Note that at this time, when filling in
3568		 * 2 SGLs for a bi-directional request, they both use the same
3569		 * DMA buffer (same cm command).
3570		 */
3571		saved_buf_len = sge->Length;
3572		saved_address_low = sge->Address.Low;
3573		saved_address_high = sge->Address.High;
3574		if (cm->cm_out_len) {
3575			sge->Length = cm->cm_out_len;
3576			sge->Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
3577			    MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
3578			cm->cm_sglsize -= ieee_sge_size;
3579			/* Endian Safe code */
3580			sge_length = sge->Length;
3581			sge->Length = htole32(sge_length);
3582			bcopy(sgep, cm->cm_sge, ieee_sge_size);
3583			cm->cm_sge =
3584			    (MPI25_SGE_IO_UNION *)((uintptr_t)cm->cm_sge +
3585			    ieee_sge_size);
3586		}
3587		sge->Length = saved_buf_len;
3588		sge->Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
3589		    MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR |
3590		    MPI25_IEEE_SGE_FLAGS_END_OF_LIST);
3591		sge->Address.Low = saved_address_low;
3592		sge->Address.High = saved_address_high;
3593	}
3594
3595	cm->cm_sglsize -= ieee_sge_size;
3596	/* Endian Safe code */
3597	sge_length = sge->Length;
3598	sge->Length = htole32(sge_length);
3599	bcopy(sgep, cm->cm_sge, ieee_sge_size);
3600	cm->cm_sge = (MPI25_SGE_IO_UNION *)((uintptr_t)cm->cm_sge +
3601	    ieee_sge_size);
3602	return (0);
3603}
3604
3605/*
3606 * Add one DMA segment to the scatter-gather list for a command.
3607 */
3608int
3609mpr_add_dmaseg(struct mpr_command *cm, vm_paddr_t pa, size_t len, u_int flags,
3610    int segsleft)
3611{
3612	MPI2_SGE_SIMPLE64 sge;
3613	MPI2_IEEE_SGE_SIMPLE64 ieee_sge;
3614
3615	if (!(cm->cm_flags & MPR_CM_FLAGS_SGE_SIMPLE)) {
3616		ieee_sge.Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
3617		    MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
3618		ieee_sge.Length = len;
3619		mpr_from_u64(pa, &ieee_sge.Address);
3620
3621		return (mpr_push_ieee_sge(cm, &ieee_sge, segsleft));
3622	} else {
3623		/*
3624		 * This driver always uses 64-bit address elements for
3625		 * simplicity.
3626		 */
3627		flags |= MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
3628		    MPI2_SGE_FLAGS_64_BIT_ADDRESSING;
3629		/* Endian conversion is handled in mpr_push_sge. */
3630		sge.FlagsLength = len | (flags << MPI2_SGE_FLAGS_SHIFT);
3631		mpr_from_u64(pa, &sge.Address);
3632
3633		return (mpr_push_sge(cm, &sge, sizeof sge, segsleft));
3634	}
3635}
3636
3637static void
3638mpr_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
3639{
3640	struct mpr_softc *sc;
3641	struct mpr_command *cm;
3642	u_int i, dir, sflags;
3643
3644	cm = (struct mpr_command *)arg;
3645	sc = cm->cm_sc;
3646
3647	/*
3648	 * In this case, just print out a warning and let the chip tell the
3649	 * user they did the wrong thing.
3650	 */
3651	if ((cm->cm_max_segs != 0) && (nsegs > cm->cm_max_segs)) {
3652		mpr_dprint(sc, MPR_ERROR, "%s: warning: busdma returned %d "
3653		    "segments, more than the %d allowed\n", __func__, nsegs,
3654		    cm->cm_max_segs);
3655	}
3656
3657	/*
3658	 * Set up DMA direction flags.  Bi-directional requests are also handled
3659	 * here.  In that case, both direction flags will be set.
3660	 */
3661	sflags = 0;
3662	if (cm->cm_flags & MPR_CM_FLAGS_SMP_PASS) {
3663		/*
3664		 * We have to add a special case for SMP passthrough, there
3665		 * is no easy way to generically handle it.  The first
3666		 * S/G element is used for the command (therefore the
3667		 * direction bit needs to be set).  The second one is used
3668		 * for the reply.  We'll leave it to the caller to make
3669		 * sure we only have two buffers.
3670		 */
3671		/*
3672		 * Even though the busdma man page says it doesn't make
3673		 * sense to have both direction flags, it does in this case.
3674		 * We have one s/g element being accessed in each direction.
3675		 */
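		/*
		 * Descriptive note: with these flags, the loop below leaves
		 * the DIRECTION bit set on segment 0 (the SMP request,
		 * written to the device) and clears it for segment 1 (the
		 * SMP reply, read back from the device).
		 */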
3676		dir = BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD;
3677
3678		/*
3679		 * Set the direction flag on the first buffer in the SMP
3680		 * passthrough request.  We'll clear it for the second one.
3681		 */
3682		sflags |= MPI2_SGE_FLAGS_DIRECTION |
3683			  MPI2_SGE_FLAGS_END_OF_BUFFER;
3684	} else if (cm->cm_flags & MPR_CM_FLAGS_DATAOUT) {
3685		sflags |= MPI2_SGE_FLAGS_HOST_TO_IOC;
3686		dir = BUS_DMASYNC_PREWRITE;
3687	} else
3688		dir = BUS_DMASYNC_PREREAD;
3689
3690	/* Check if a native SG list is needed for an NVMe PCIe device. */
3691	if (cm->cm_targ && cm->cm_targ->is_nvme &&
3692	    mpr_check_pcie_native_sgl(sc, cm, segs, nsegs) == 0) {
3693		/* A native SG list was built, skip to end. */
3694		goto out;
3695	}
3696
3697	for (i = 0; i < nsegs; i++) {
3698		if ((cm->cm_flags & MPR_CM_FLAGS_SMP_PASS) && (i != 0)) {
3699			sflags &= ~MPI2_SGE_FLAGS_DIRECTION;
3700		}
3701		error = mpr_add_dmaseg(cm, segs[i].ds_addr, segs[i].ds_len,
3702		    sflags, nsegs - i);
3703		if (error != 0) {
3704			/* Resource shortage, roll back! */
3705			if (ratecheck(&sc->lastfail, &mpr_chainfail_interval))
3706				mpr_dprint(sc, MPR_INFO, "Out of chain frames, "
3707				    "consider increasing hw.mpr.max_chains.\n");
3708			cm->cm_flags |= MPR_CM_FLAGS_CHAIN_FAILED;
3709			/*
3710			 * mpr_complete_command can only be called on commands
3711			 * that are in the queue. Since this is an error path
3712			 * which gets called before we enqueue, update the state
3713			 * to meet this requirement before we complete it.
3714			 */
3715			cm->cm_state = MPR_CM_STATE_INQUEUE;
3716			mpr_complete_command(sc, cm);
3717			return;
3718		}
3719	}
3720
3721out:
3722	bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir);
3723	mpr_enqueue_request(sc, cm);
3724
3725	return;
3726}
3727
3728static void
3729mpr_data_cb2(void *arg, bus_dma_segment_t *segs, int nsegs, bus_size_t mapsize,
3730	     int error)
3731{
3732	mpr_data_cb(arg, segs, nsegs, error);
3733}
3734
3735/*
3736 * This is the routine to enqueue commands asynchronously.
3737 * Note that the only error path here is from bus_dmamap_load(), which can
3738 * return EINPROGRESS if it is waiting for resources.  Other than this, it's
3739 * assumed that if you have a command in-hand, then you have enough credits
3740 * to use it.
3741 */
3742int
3743mpr_map_command(struct mpr_softc *sc, struct mpr_command *cm)
3744{
3745	int error = 0;
3746
3747	if (cm->cm_flags & MPR_CM_FLAGS_USE_UIO) {
3748		error = bus_dmamap_load_uio(sc->buffer_dmat, cm->cm_dmamap,
3749		    &cm->cm_uio, mpr_data_cb2, cm, 0);
3750	} else if (cm->cm_flags & MPR_CM_FLAGS_USE_CCB) {
3751		error = bus_dmamap_load_ccb(sc->buffer_dmat, cm->cm_dmamap,
3752		    cm->cm_data, mpr_data_cb, cm, 0);
3753	} else if ((cm->cm_data != NULL) && (cm->cm_length != 0)) {
3754		error = bus_dmamap_load(sc->buffer_dmat, cm->cm_dmamap,
3755		    cm->cm_data, cm->cm_length, mpr_data_cb, cm, 0);
3756	} else {
3757		/* Add a zero-length element as needed */
3758		if (cm->cm_sge != NULL)
3759			mpr_add_dmaseg(cm, 0, 0, 0, 1);
3760		mpr_enqueue_request(sc, cm);
3761	}
3762
3763	return (error);
3764}
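
/*
 * Typical usage sketch (illustrative only; see mpr_read_config_page() below
 * for a real caller): fill in cm_data, cm_length, cm_sge, cm_sglsize and the
 * MPR_CM_FLAGS_DATAIN/DATAOUT flags on an allocated command, then call
 * mpr_map_command().  The DMA callback builds the SGL and enqueues the
 * request, and cm_complete runs when the reply arrives.
 */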
3765
3766/*
3767 * This is the routine to enqueue commands synchronously.  An error of
3768 * EINPROGRESS from mpr_map_command() is ignored since the command will
3769 * be executed and enqueued automatically.  Other errors come from msleep().
3770 */
3771int
3772mpr_wait_command(struct mpr_softc *sc, struct mpr_command **cmp, int timeout,
3773    int sleep_flag)
3774{
3775	int error, rc;
3776	struct timeval cur_time, start_time;
3777	struct mpr_command *cm = *cmp;
3778
3779	if (sc->mpr_flags & MPR_FLAGS_DIAGRESET)
3780		return  EBUSY;
3781
3782	cm->cm_complete = NULL;
3783	cm->cm_flags |= (MPR_CM_FLAGS_WAKEUP | MPR_CM_FLAGS_POLLED);
3784	error = mpr_map_command(sc, cm);
3785	if ((error != 0) && (error != EINPROGRESS))
3786		return (error);
3787
3788	/* Check for context and wait for 50 mSec at a time until time has
3789	 * expired or the command has finished.  If msleep can't be used, need
3790	 * to poll. */
3791	if (curthread->td_no_sleeping)
3792		sleep_flag = NO_SLEEP;
3793	getmicrouptime(&start_time);
3794	if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP) {
3795		error = msleep(cm, &sc->mpr_mtx, 0, "mprwait", timeout*hz);
3796		if (error == EWOULDBLOCK) {
3797			/*
3798			 * Record the actual elapsed time in the case of a
3799			 * timeout for the message below.
3800			 */
3801			getmicrouptime(&cur_time);
3802			timevalsub(&cur_time, &start_time);
3803		}
3804	} else {
3805		while ((cm->cm_flags & MPR_CM_FLAGS_COMPLETE) == 0) {
3806			mpr_intr_locked(sc);
3807			if (sleep_flag == CAN_SLEEP)
3808				pause("mprwait", hz/20);
3809			else
3810				DELAY(50000);
3811
3812			getmicrouptime(&cur_time);
3813			timevalsub(&cur_time, &start_time);
3814			if (cur_time.tv_sec > timeout) {
3815				error = EWOULDBLOCK;
3816				break;
3817			}
3818		}
3819	}
3820
3821	if (error == EWOULDBLOCK) {
3822		if (cm->cm_timeout_handler == NULL) {
3823			mpr_dprint(sc, MPR_FAULT, "Calling Reinit from %s, timeout=%d,"
3824			    " elapsed=%jd\n", __func__, timeout,
3825			    (intmax_t)cur_time.tv_sec);
3826			rc = mpr_reinit(sc);
3827			mpr_dprint(sc, MPR_FAULT, "Reinit %s\n", (rc == 0) ? "success" :
3828			    "failed");
3829		} else
3830			cm->cm_timeout_handler(sc, cm);
3831		if (sc->mpr_flags & MPR_FLAGS_REALLOCATED) {
3832			/*
3833			 * Tell the caller that we freed the command in a
3834			 * reinit.
3835			 */
3836			*cmp = NULL;
3837		}
3838		error = ETIMEDOUT;
3839	}
3840	return (error);
3841}
3842
3843/*
3844 * This is the routine to enqueue a command synchronously and poll for
3845 * completion.  Its use should be rare.
3846 */
3847int
3848mpr_request_polled(struct mpr_softc *sc, struct mpr_command **cmp)
3849{
3850	int error, rc;
3851	struct timeval cur_time, start_time;
3852	struct mpr_command *cm = *cmp;
3853
3854	error = 0;
3855
3856	cm->cm_flags |= MPR_CM_FLAGS_POLLED;
3857	cm->cm_complete = NULL;
3858	mpr_map_command(sc, cm);
3859
3860	getmicrouptime(&start_time);
3861	while ((cm->cm_flags & MPR_CM_FLAGS_COMPLETE) == 0) {
3862		mpr_intr_locked(sc);
3863
3864		if (mtx_owned(&sc->mpr_mtx))
3865			msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
3866			    "mprpoll", hz/20);
3867		else
3868			pause("mprpoll", hz/20);
3869
3870		/*
3871		 * Check for real-time timeout and fail if more than 60 seconds.
3872		 */
3873		getmicrouptime(&cur_time);
3874		timevalsub(&cur_time, &start_time);
3875		if (cur_time.tv_sec > 60) {
3876			mpr_dprint(sc, MPR_FAULT, "polling failed\n");
3877			error = ETIMEDOUT;
3878			break;
3879		}
3880	}
3881	cm->cm_state = MPR_CM_STATE_BUSY;
3882	if (error) {
3883		mpr_dprint(sc, MPR_FAULT, "Calling Reinit from %s\n", __func__);
3884		rc = mpr_reinit(sc);
3885		mpr_dprint(sc, MPR_FAULT, "Reinit %s\n", (rc == 0) ? "success" :
3886		    "failed");
3887
3888		if (sc->mpr_flags & MPR_FLAGS_REALLOCATED) {
3889			/*
3890			 * Tell the caller that we freed the command in a
3891			 * reinit.
3892			 */
3893			*cmp = NULL;
3894		}
3895	}
3896	return (error);
3897}
3898
3899/*
3900 * The MPT driver had a verbose interface for config pages.  In this driver,
3901 * it is reduced to much simpler terms, similar to the Linux driver.
3902 */
3903int
3904mpr_read_config_page(struct mpr_softc *sc, struct mpr_config_params *params)
3905{
3906	MPI2_CONFIG_REQUEST *req;
3907	struct mpr_command *cm;
3908	int error;
3909
3910	if (sc->mpr_flags & MPR_FLAGS_BUSY) {
3911		return (EBUSY);
3912	}
3913
3914	cm = mpr_alloc_command(sc);
3915	if (cm == NULL) {
3916		return (EBUSY);
3917	}
3918
3919	req = (MPI2_CONFIG_REQUEST *)cm->cm_req;
3920	req->Function = MPI2_FUNCTION_CONFIG;
3921	req->Action = params->action;
3922	req->SGLFlags = 0;
3923	req->ChainOffset = 0;
3924	req->PageAddress = params->page_address;
3925	if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) {
3926		MPI2_CONFIG_EXTENDED_PAGE_HEADER *hdr;
3927
3928		hdr = &params->hdr.Ext;
3929		req->ExtPageType = hdr->ExtPageType;
3930		req->ExtPageLength = hdr->ExtPageLength;
3931		req->Header.PageType = MPI2_CONFIG_PAGETYPE_EXTENDED;
3932		req->Header.PageLength = 0; /* Must be set to zero */
3933		req->Header.PageNumber = hdr->PageNumber;
3934		req->Header.PageVersion = hdr->PageVersion;
3935	} else {
3936		MPI2_CONFIG_PAGE_HEADER *hdr;
3937
3938		hdr = &params->hdr.Struct;
3939		req->Header.PageType = hdr->PageType;
3940		req->Header.PageNumber = hdr->PageNumber;
3941		req->Header.PageLength = hdr->PageLength;
3942		req->Header.PageVersion = hdr->PageVersion;
3943	}
3944
3945	cm->cm_data = params->buffer;
3946	cm->cm_length = params->length;
3947	if (cm->cm_data != NULL) {
3948		cm->cm_sge = &req->PageBufferSGE;
3949		cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION);
3950		cm->cm_flags = MPR_CM_FLAGS_SGE_SIMPLE | MPR_CM_FLAGS_DATAIN;
3951	} else
3952		cm->cm_sge = NULL;
3953	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
3954
3955	cm->cm_complete_data = params;
3956	if (params->callback != NULL) {
3957		cm->cm_complete = mpr_config_complete;
3958		return (mpr_map_command(sc, cm));
3959	} else {
3960		error = mpr_wait_command(sc, &cm, 0, CAN_SLEEP);
3961		if (error) {
3962			mpr_dprint(sc, MPR_FAULT,
3963			    "Error %d reading config page\n", error);
3964			if (cm != NULL)
3965				mpr_free_command(sc, cm);
3966			return (error);
3967		}
3968		mpr_config_complete(sc, cm);
3969	}
3970
3971	return (0);
3972}
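
/*
 * Illustrative caller sketch (hypothetical: "page" is a config page structure
 * and "saved_header" was filled in by a prior PAGE_HEADER request).  A
 * synchronous read fills in a struct mpr_config_params, leaves callback NULL,
 * and checks params.status afterwards:
 *
 *	struct mpr_config_params params;
 *
 *	bzero(&params, sizeof(params));
 *	params.action = MPI2_CONFIG_ACTION_PAGE_READ_CURRENT;
 *	params.hdr.Struct = saved_header;
 *	params.page_address = 0;
 *	params.buffer = &page;
 *	params.length = sizeof(page);
 *	params.callback = NULL;
 *	error = mpr_read_config_page(sc, &params);
 *	if (error != 0 || params.status != MPI2_IOCSTATUS_SUCCESS)
 *		... handle the failure ...
 */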
3973
3974int
3975mpr_write_config_page(struct mpr_softc *sc, struct mpr_config_params *params)
3976{
3977	return (EINVAL);
3978}
3979
3980static void
3981mpr_config_complete(struct mpr_softc *sc, struct mpr_command *cm)
3982{
3983	MPI2_CONFIG_REPLY *reply;
3984	struct mpr_config_params *params;
3985
3986	MPR_FUNCTRACE(sc);
3987	params = cm->cm_complete_data;
3988
3989	if (cm->cm_data != NULL) {
3990		bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap,
3991		    BUS_DMASYNC_POSTREAD);
3992		bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap);
3993	}
3994
3995	/*
3996	 * XXX KDM need to do more error recovery?  This results in the
3997	 * device in question not getting probed.
3998	 */
3999	if ((cm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) {
4000		params->status = MPI2_IOCSTATUS_BUSY;
4001		goto done;
4002	}
4003
4004	reply = (MPI2_CONFIG_REPLY *)cm->cm_reply;
4005	if (reply == NULL) {
4006		params->status = MPI2_IOCSTATUS_BUSY;
4007		goto done;
4008	}
4009	params->status = reply->IOCStatus;
4010	if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) {
4011		params->hdr.Ext.ExtPageType = reply->ExtPageType;
4012		params->hdr.Ext.ExtPageLength = reply->ExtPageLength;
4013		params->hdr.Ext.PageType = reply->Header.PageType;
4014		params->hdr.Ext.PageNumber = reply->Header.PageNumber;
4015		params->hdr.Ext.PageVersion = reply->Header.PageVersion;
4016	} else {
4017		params->hdr.Struct.PageType = reply->Header.PageType;
4018		params->hdr.Struct.PageNumber = reply->Header.PageNumber;
4019		params->hdr.Struct.PageLength = reply->Header.PageLength;
4020		params->hdr.Struct.PageVersion = reply->Header.PageVersion;
4021	}
4022
4023done:
4024	mpr_free_command(sc, cm);
4025	if (params->callback != NULL)
4026		params->callback(sc, params);
4027
4028	return;
4029}
4030