1/*-
2 * Copyright (c) 2009 Yahoo! Inc.
3 * Copyright (c) 2011-2015 LSI Corp.
4 * Copyright (c) 2013-2016 Avago Technologies
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
 * Avago Technologies (LSI) MPT-Fusion Host Adapter FreeBSD driver
29 *
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: stable/11/sys/dev/mpr/mpr.c 329189 2018-02-13 02:11:39Z mav $");
34
35/* Communications core for Avago Technologies (LSI) MPT3 */
36
37/* TODO Move headers to mprvar */
38#include <sys/types.h>
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/kernel.h>
42#include <sys/selinfo.h>
43#include <sys/lock.h>
44#include <sys/mutex.h>
45#include <sys/module.h>
46#include <sys/bus.h>
47#include <sys/conf.h>
48#include <sys/bio.h>
49#include <sys/malloc.h>
50#include <sys/uio.h>
51#include <sys/sysctl.h>
52#include <sys/queue.h>
53#include <sys/kthread.h>
54#include <sys/taskqueue.h>
55#include <sys/endian.h>
56#include <sys/eventhandler.h>
57
58#include <machine/bus.h>
59#include <machine/resource.h>
60#include <sys/rman.h>
61#include <sys/proc.h>
62
63#include <dev/pci/pcivar.h>
64
65#include <cam/cam.h>
66#include <cam/cam_ccb.h>
67#include <cam/scsi/scsi_all.h>
68
69#include <dev/mpr/mpi/mpi2_type.h>
70#include <dev/mpr/mpi/mpi2.h>
71#include <dev/mpr/mpi/mpi2_ioc.h>
72#include <dev/mpr/mpi/mpi2_sas.h>
73#include <dev/mpr/mpi/mpi2_pci.h>
74#include <dev/mpr/mpi/mpi2_cnfg.h>
75#include <dev/mpr/mpi/mpi2_init.h>
76#include <dev/mpr/mpi/mpi2_tool.h>
77#include <dev/mpr/mpr_ioctl.h>
78#include <dev/mpr/mprvar.h>
79#include <dev/mpr/mpr_table.h>
80#include <dev/mpr/mpr_sas.h>
81
82static int mpr_diag_reset(struct mpr_softc *sc, int sleep_flag);
83static int mpr_init_queues(struct mpr_softc *sc);
84static int mpr_message_unit_reset(struct mpr_softc *sc, int sleep_flag);
85static int mpr_transition_operational(struct mpr_softc *sc);
86static int mpr_iocfacts_allocate(struct mpr_softc *sc, uint8_t attaching);
87static void mpr_iocfacts_free(struct mpr_softc *sc);
88static void mpr_startup(void *arg);
89static int mpr_send_iocinit(struct mpr_softc *sc);
90static int mpr_alloc_queues(struct mpr_softc *sc);
91static int mpr_alloc_replies(struct mpr_softc *sc);
92static int mpr_alloc_requests(struct mpr_softc *sc);
93static int mpr_alloc_nvme_prp_pages(struct mpr_softc *sc);
94static int mpr_attach_log(struct mpr_softc *sc);
95static __inline void mpr_complete_command(struct mpr_softc *sc,
96    struct mpr_command *cm);
97static void mpr_dispatch_event(struct mpr_softc *sc, uintptr_t data,
98    MPI2_EVENT_NOTIFICATION_REPLY *reply);
99static void mpr_config_complete(struct mpr_softc *sc, struct mpr_command *cm);
100static void mpr_periodic(void *);
101static int mpr_reregister_events(struct mpr_softc *sc);
102static void mpr_enqueue_request(struct mpr_softc *sc, struct mpr_command *cm);
103static int mpr_get_iocfacts(struct mpr_softc *sc, MPI2_IOC_FACTS_REPLY *facts);
104static int mpr_wait_db_ack(struct mpr_softc *sc, int timeout, int sleep_flag);
105SYSCTL_NODE(_hw, OID_AUTO, mpr, CTLFLAG_RD, 0, "MPR Driver Parameters");
106
107MALLOC_DEFINE(M_MPR, "mpr", "mpr driver memory");
108
109/*
110 * Do a "Diagnostic Reset" aka a hard reset.  This should get the chip out of
111 * any state and back to its initialization state machine.
112 */
113static char mpt2_reset_magic[] = { 0x00, 0x0f, 0x04, 0x0b, 0x02, 0x07, 0x0d };
114
/*
 * This union is used so that cm->cm_desc.Words can be converted with
 * le64toh().  The compiler only accepts a uint64_t argument; passing the
 * aggregate directly produces the error:
 * "aggregate value used where an integer was expected"
 */
121typedef union _reply_descriptor {
122        u64 word;
123        struct {
124                u32 low;
125                u32 high;
126        } u;
127} reply_descriptor, request_descriptor;
128
129/* Rate limit chain-fail messages to 1 per minute */
130static struct timeval mpr_chainfail_interval = { 60, 0 };
131
/*
 * sleep_flag can be either CAN_SLEEP or NO_SLEEP.
 * If this function is called from process context it is safe to sleep, so
 * CAN_SLEEP may be used.  If it is called from an interrupt handler, it
 * must not sleep, so NO_SLEEP must be set.  Based on the sleep flag, the
 * driver waits with either msleep, pause, or DELAY.  msleep and pause are
 * equivalent, but pause is used when the driver does not hold mpr_mtx.
 */
141static int
mpr_diag_reset(struct mpr_softc *sc, int sleep_flag)
143{
144	uint32_t reg;
145	int i, error, tries = 0;
146	uint8_t first_wait_done = FALSE;
147
148	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
149
150	/* Clear any pending interrupts */
151	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
152
	/*
	 * Force NO_SLEEP for threads that are prohibited from sleeping,
	 * e.g. threads running in an interrupt handler.
	 */
157#if __FreeBSD_version >= 1000029
158	if (curthread->td_no_sleeping)
159#else //__FreeBSD_version < 1000029
160	if (curthread->td_pflags & TDP_NOSLEEPING)
161#endif //__FreeBSD_version >= 1000029
162		sleep_flag = NO_SLEEP;
163
164	/* Push the magic sequence */
165	error = ETIMEDOUT;
166	while (tries++ < 20) {
167		for (i = 0; i < sizeof(mpt2_reset_magic); i++)
168			mpr_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET,
169			    mpt2_reset_magic[i]);
170
171		/* wait 100 msec */
172		if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
173			msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
174			    "mprdiag", hz/10);
175		else if (sleep_flag == CAN_SLEEP)
176			pause("mprdiag", hz/10);
177		else
178			DELAY(100 * 1000);
179
180		reg = mpr_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
181		if (reg & MPI2_DIAG_DIAG_WRITE_ENABLE) {
182			error = 0;
183			break;
184		}
185	}
186	if (error)
187		return (error);
188
189	/* Send the actual reset.  XXX need to refresh the reg? */
190	mpr_regwrite(sc, MPI2_HOST_DIAGNOSTIC_OFFSET,
191	    reg | MPI2_DIAG_RESET_ADAPTER);
192
193	/* Wait up to 300 seconds in 50ms intervals */
194	error = ETIMEDOUT;
195	for (i = 0; i < 6000; i++) {
196		/*
197		 * Wait 50 msec. If this is the first time through, wait 256
198		 * msec to satisfy Diag Reset timing requirements.
199		 */
200		if (first_wait_done) {
201			if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
202				msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
203				    "mprdiag", hz/20);
204			else if (sleep_flag == CAN_SLEEP)
205				pause("mprdiag", hz/20);
206			else
207				DELAY(50 * 1000);
208		} else {
209			DELAY(256 * 1000);
210			first_wait_done = TRUE;
211		}
212		/*
213		 * Check for the RESET_ADAPTER bit to be cleared first, then
214		 * wait for the RESET state to be cleared, which takes a little
215		 * longer.
216		 */
217		reg = mpr_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
218		if (reg & MPI2_DIAG_RESET_ADAPTER) {
219			continue;
220		}
221		reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
222		if ((reg & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_RESET) {
223			error = 0;
224			break;
225		}
226	}
227	if (error)
228		return (error);
229
230	mpr_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET, 0x0);
231
232	return (0);
233}
234
235static int
236mpr_message_unit_reset(struct mpr_softc *sc, int sleep_flag)
237{
238
239	MPR_FUNCTRACE(sc);
240
241	mpr_regwrite(sc, MPI2_DOORBELL_OFFSET,
242	    MPI2_FUNCTION_IOC_MESSAGE_UNIT_RESET <<
243	    MPI2_DOORBELL_FUNCTION_SHIFT);
244
245	if (mpr_wait_db_ack(sc, 5, sleep_flag) != 0) {
246		mpr_dprint(sc, MPR_FAULT, "Doorbell handshake failed : <%s>\n",
247				__func__);
248		return (ETIMEDOUT);
249	}
250
251	return (0);
252}
253
254static int
255mpr_transition_ready(struct mpr_softc *sc)
256{
257	uint32_t reg, state;
258	int error, tries = 0;
259	int sleep_flags;
260
261	MPR_FUNCTRACE(sc);
262	/* If we are in attach call, do not sleep */
263	sleep_flags = (sc->mpr_flags & MPR_FLAGS_ATTACH_DONE)
264	    ? CAN_SLEEP : NO_SLEEP;
265
266	error = 0;
267	while (tries++ < 1200) {
268		reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
269		mpr_dprint(sc, MPR_INIT, "Doorbell= 0x%x\n", reg);
270
271		/*
272		 * Ensure the IOC is ready to talk.  If it's not, try
273		 * resetting it.
274		 */
275		if (reg & MPI2_DOORBELL_USED) {
276			mpr_diag_reset(sc, sleep_flags);
277			DELAY(50000);
278			continue;
279		}
280
281		/* Is the adapter owned by another peer? */
282		if ((reg & MPI2_DOORBELL_WHO_INIT_MASK) ==
283		    (MPI2_WHOINIT_PCI_PEER << MPI2_DOORBELL_WHO_INIT_SHIFT)) {
284			device_printf(sc->mpr_dev, "IOC is under the control "
285			    "of another peer host, aborting initialization.\n");
286			return (ENXIO);
287		}
288
289		state = reg & MPI2_IOC_STATE_MASK;
290		if (state == MPI2_IOC_STATE_READY) {
291			/* Ready to go! */
292			error = 0;
293			break;
294		} else if (state == MPI2_IOC_STATE_FAULT) {
295			mpr_dprint(sc, MPR_FAULT, "IOC in fault state 0x%x\n",
296			    state & MPI2_DOORBELL_FAULT_CODE_MASK);
297			mpr_diag_reset(sc, sleep_flags);
298		} else if (state == MPI2_IOC_STATE_OPERATIONAL) {
299			/* Need to take ownership */
300			mpr_message_unit_reset(sc, sleep_flags);
301		} else if (state == MPI2_IOC_STATE_RESET) {
302			/* Wait a bit, IOC might be in transition */
303			mpr_dprint(sc, MPR_FAULT,
304			    "IOC in unexpected reset state\n");
305		} else {
306			mpr_dprint(sc, MPR_FAULT,
307			    "IOC in unknown state 0x%x\n", state);
308			error = EINVAL;
309			break;
310		}
311
312		/* Wait 50ms for things to settle down. */
313		DELAY(50000);
314	}
315
316	if (error)
317		device_printf(sc->mpr_dev, "Cannot transition IOC to ready\n");
318	return (error);
319}
320
321static int
322mpr_transition_operational(struct mpr_softc *sc)
323{
324	uint32_t reg, state;
325	int error;
326
327	MPR_FUNCTRACE(sc);
328
329	error = 0;
330	reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
331	mpr_dprint(sc, MPR_INIT, "Doorbell= 0x%x\n", reg);
332
333	state = reg & MPI2_IOC_STATE_MASK;
334	if (state != MPI2_IOC_STATE_READY) {
335		if ((error = mpr_transition_ready(sc)) != 0) {
336			mpr_dprint(sc, MPR_FAULT,
337			    "%s failed to transition ready\n", __func__);
338			return (error);
339		}
340	}
341
342	error = mpr_send_iocinit(sc);
343	return (error);
344}
345
346/*
347 * This is called during attach and when re-initializing due to a Diag Reset.
348 * IOC Facts is used to allocate many of the structures needed by the driver.
349 * If called from attach, de-allocation is not required because the driver has
 * not allocated any structures yet, but if called from a Diag Reset, the
 * structures previously allocated based on IOC Facts will need to be freed
 * and reallocated based on the latest IOC Facts.
353 */
354static int
355mpr_iocfacts_allocate(struct mpr_softc *sc, uint8_t attaching)
356{
357	int error;
358	Mpi2IOCFactsReply_t saved_facts;
359	uint8_t saved_mode, reallocating;
360
361	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
362
363	/* Save old IOC Facts and then only reallocate if Facts have changed */
364	if (!attaching) {
365		bcopy(sc->facts, &saved_facts, sizeof(MPI2_IOC_FACTS_REPLY));
366	}
367
	/*
	 * Get IOC Facts.  Throughout this function, panic on an error if doing
	 * a re-initialization, but only return the error if attaching so that
	 * the OS can handle it.
	 */
373	if ((error = mpr_get_iocfacts(sc, sc->facts)) != 0) {
374		if (attaching) {
375			mpr_dprint(sc, MPR_FAULT, "%s failed to get IOC Facts "
376			    "with error %d\n", __func__, error);
377			return (error);
378		} else {
379			panic("%s failed to get IOC Facts with error %d\n",
380			    __func__, error);
381		}
382	}
383
384	MPR_DPRINT_PAGE(sc, MPR_XINFO, iocfacts, sc->facts);
385
386	snprintf(sc->fw_version, sizeof(sc->fw_version),
387	    "%02d.%02d.%02d.%02d",
388	    sc->facts->FWVersion.Struct.Major,
389	    sc->facts->FWVersion.Struct.Minor,
390	    sc->facts->FWVersion.Struct.Unit,
391	    sc->facts->FWVersion.Struct.Dev);
392
393	mpr_printf(sc, "Firmware: %s, Driver: %s\n", sc->fw_version,
394	    MPR_DRIVER_VERSION);
395	mpr_printf(sc, "IOCCapabilities: %b\n", sc->facts->IOCCapabilities,
396	    "\20" "\3ScsiTaskFull" "\4DiagTrace" "\5SnapBuf" "\6ExtBuf"
397	    "\7EEDP" "\10BiDirTarg" "\11Multicast" "\14TransRetry" "\15IR"
398	    "\16EventReplay" "\17RaidAccel" "\20MSIXIndex" "\21HostDisc"
399	    "\22FastPath" "\23RDPQArray" "\24AtomicReqDesc" "\25PCIeSRIOV");
400
401	/*
402	 * If the chip doesn't support event replay then a hard reset will be
403	 * required to trigger a full discovery.  Do the reset here then
404	 * retransition to Ready.  A hard reset might have already been done,
405	 * but it doesn't hurt to do it again.  Only do this if attaching, not
406	 * for a Diag Reset.
407	 */
408	if (attaching) {
409		if ((sc->facts->IOCCapabilities &
410		    MPI2_IOCFACTS_CAPABILITY_EVENT_REPLAY) == 0) {
411			mpr_diag_reset(sc, NO_SLEEP);
412			if ((error = mpr_transition_ready(sc)) != 0) {
413				mpr_dprint(sc, MPR_FAULT, "%s failed to "
414				    "transition to ready with error %d\n",
415				    __func__, error);
416				return (error);
417			}
418		}
419	}
420
421	/*
422	 * Set flag if IR Firmware is loaded.  If the RAID Capability has
423	 * changed from the previous IOC Facts, log a warning, but only if
424	 * checking this after a Diag Reset and not during attach.
425	 */
426	saved_mode = sc->ir_firmware;
427	if (sc->facts->IOCCapabilities &
428	    MPI2_IOCFACTS_CAPABILITY_INTEGRATED_RAID)
429		sc->ir_firmware = 1;
430	if (!attaching) {
431		if (sc->ir_firmware != saved_mode) {
432			mpr_dprint(sc, MPR_FAULT, "%s new IR/IT mode in IOC "
433			    "Facts does not match previous mode\n", __func__);
434		}
435	}
436
437	/* Only deallocate and reallocate if relevant IOC Facts have changed */
438	reallocating = FALSE;
439	sc->mpr_flags &= ~MPR_FLAGS_REALLOCATED;
440
441	if ((!attaching) &&
442	    ((saved_facts.MsgVersion != sc->facts->MsgVersion) ||
443	    (saved_facts.HeaderVersion != sc->facts->HeaderVersion) ||
444	    (saved_facts.MaxChainDepth != sc->facts->MaxChainDepth) ||
445	    (saved_facts.RequestCredit != sc->facts->RequestCredit) ||
446	    (saved_facts.ProductID != sc->facts->ProductID) ||
447	    (saved_facts.IOCCapabilities != sc->facts->IOCCapabilities) ||
448	    (saved_facts.IOCRequestFrameSize !=
449	    sc->facts->IOCRequestFrameSize) ||
450	    (saved_facts.IOCMaxChainSegmentSize !=
451	    sc->facts->IOCMaxChainSegmentSize) ||
452	    (saved_facts.MaxTargets != sc->facts->MaxTargets) ||
453	    (saved_facts.MaxSasExpanders != sc->facts->MaxSasExpanders) ||
454	    (saved_facts.MaxEnclosures != sc->facts->MaxEnclosures) ||
455	    (saved_facts.HighPriorityCredit != sc->facts->HighPriorityCredit) ||
456	    (saved_facts.MaxReplyDescriptorPostQueueDepth !=
457	    sc->facts->MaxReplyDescriptorPostQueueDepth) ||
458	    (saved_facts.ReplyFrameSize != sc->facts->ReplyFrameSize) ||
459	    (saved_facts.MaxVolumes != sc->facts->MaxVolumes) ||
460	    (saved_facts.MaxPersistentEntries !=
461	    sc->facts->MaxPersistentEntries))) {
462		reallocating = TRUE;
463
464		/* Record that we reallocated everything */
465		sc->mpr_flags |= MPR_FLAGS_REALLOCATED;
466	}
467
468	/*
469	 * Some things should be done if attaching or re-allocating after a Diag
470	 * Reset, but are not needed after a Diag Reset if the FW has not
471	 * changed.
472	 */
473	if (attaching || reallocating) {
474		/*
475		 * Check if controller supports FW diag buffers and set flag to
476		 * enable each type.
477		 */
478		if (sc->facts->IOCCapabilities &
479		    MPI2_IOCFACTS_CAPABILITY_DIAG_TRACE_BUFFER)
480			sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_TRACE].
481			    enabled = TRUE;
482		if (sc->facts->IOCCapabilities &
483		    MPI2_IOCFACTS_CAPABILITY_SNAPSHOT_BUFFER)
484			sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_SNAPSHOT].
485			    enabled = TRUE;
486		if (sc->facts->IOCCapabilities &
487		    MPI2_IOCFACTS_CAPABILITY_EXTENDED_BUFFER)
488			sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_EXTENDED].
489			    enabled = TRUE;
490
491		/*
492		 * Set flags for some supported items.
493		 */
494		if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EEDP)
495			sc->eedp_enabled = TRUE;
496		if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_TLR)
497			sc->control_TLR = TRUE;
498		if (sc->facts->IOCCapabilities &
499		    MPI26_IOCFACTS_CAPABILITY_ATOMIC_REQ)
500			sc->atomic_desc_capable = TRUE;
501
502		/*
503		 * Size the queues. Since the reply queues always need one free
504		 * entry, we'll just deduct one reply message here.
505		 */
506		sc->num_prireqs = MIN(MPR_PRI_REQ_FRAMES,
507		    sc->facts->HighPriorityCredit);
508		sc->num_reqs = MIN(MPR_REQ_FRAMES, sc->facts->RequestCredit) +
509		    sc->num_prireqs;
510		sc->num_replies = MIN(MPR_REPLY_FRAMES + MPR_EVT_REPLY_FRAMES,
511		    sc->facts->MaxReplyDescriptorPostQueueDepth) - 1;
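		/*
		 * For example (illustrative values only, not driver
		 * defaults): if the IOC reported a
		 * MaxReplyDescriptorPostQueueDepth of 1024 and the driver's
		 * compile-time frame counts exceeded that, num_replies would
		 * become 1024 - 1 = 1023.
		 */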
512
513		/*
514		 * Initialize all Tail Queues
515		 */
516		TAILQ_INIT(&sc->req_list);
517		TAILQ_INIT(&sc->high_priority_req_list);
518		TAILQ_INIT(&sc->chain_list);
519		TAILQ_INIT(&sc->prp_page_list);
520		TAILQ_INIT(&sc->tm_list);
521	}
522
523	/*
524	 * If doing a Diag Reset and the FW is significantly different
525	 * (reallocating will be set above in IOC Facts comparison), then all
526	 * buffers based on the IOC Facts will need to be freed before they are
527	 * reallocated.
528	 */
529	if (reallocating) {
530		mpr_iocfacts_free(sc);
531		mprsas_realloc_targets(sc, saved_facts.MaxTargets +
532		    saved_facts.MaxVolumes);
533	}
534
535	/*
536	 * Any deallocation has been completed.  Now start reallocating
537	 * if needed.  Will only need to reallocate if attaching or if the new
538	 * IOC Facts are different from the previous IOC Facts after a Diag
539	 * Reset. Targets have already been allocated above if needed.
540	 */
541	if (attaching || reallocating) {
542		if (((error = mpr_alloc_queues(sc)) != 0) ||
543		    ((error = mpr_alloc_replies(sc)) != 0) ||
544		    ((error = mpr_alloc_requests(sc)) != 0)) {
			if (attaching) {
546				mpr_dprint(sc, MPR_FAULT, "%s failed to alloc "
547				    "queues with error %d\n", __func__, error);
548				mpr_free(sc);
549				return (error);
550			} else {
551				panic("%s failed to alloc queues with error "
552				    "%d\n", __func__, error);
553			}
554		}
555	}
556
557	/* Always initialize the queues */
558	bzero(sc->free_queue, sc->fqdepth * 4);
559	mpr_init_queues(sc);
560
561	/*
562	 * Always get the chip out of the reset state, but only panic if not
563	 * attaching.  If attaching and there is an error, that is handled by
564	 * the OS.
565	 */
566	error = mpr_transition_operational(sc);
567	if (error != 0) {
568		if (attaching) {
569			mpr_printf(sc, "%s failed to transition to operational "
570			    "with error %d\n", __func__, error);
571			mpr_free(sc);
572			return (error);
573		} else {
574			panic("%s failed to transition to operational with "
575			    "error %d\n", __func__, error);
576		}
577	}
578
579	/*
580	 * Finish the queue initialization.
581	 * These are set here instead of in mpr_init_queues() because the
582	 * IOC resets these values during the state transition in
583	 * mpr_transition_operational().  The free index is set to 1
584	 * because the corresponding index in the IOC is set to 0, and the
585	 * IOC treats the queues as full if both are set to the same value.
586	 * Hence the reason that the queue can't hold all of the possible
587	 * replies.
588	 */
589	sc->replypostindex = 0;
590	mpr_regwrite(sc, MPI2_REPLY_FREE_HOST_INDEX_OFFSET, sc->replyfreeindex);
591	mpr_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, 0);
592
593	/*
594	 * Attach the subsystems so they can prepare their event masks.
595	 */
596	/* XXX Should be dynamic so that IM/IR and user modules can attach */
597	if (attaching) {
598		if (((error = mpr_attach_log(sc)) != 0) ||
599		    ((error = mpr_attach_sas(sc)) != 0) ||
600		    ((error = mpr_attach_user(sc)) != 0)) {
601			mpr_printf(sc, "%s failed to attach all subsystems: "
602			    "error %d\n", __func__, error);
603			mpr_free(sc);
604			return (error);
605		}
606
607		if ((error = mpr_pci_setup_interrupts(sc)) != 0) {
608			mpr_printf(sc, "%s failed to setup interrupts\n",
609			    __func__);
610			mpr_free(sc);
611			return (error);
612		}
613	}
614
615	return (error);
616}
617
618/*
 * This is called when memory is being freed (during detach, for example) and
 * when buffers need to be reallocated due to a Diag Reset.
621 */
622static void
623mpr_iocfacts_free(struct mpr_softc *sc)
624{
625	struct mpr_command *cm;
626	int i;
627
628	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
629
630	if (sc->free_busaddr != 0)
631		bus_dmamap_unload(sc->queues_dmat, sc->queues_map);
632	if (sc->free_queue != NULL)
633		bus_dmamem_free(sc->queues_dmat, sc->free_queue,
634		    sc->queues_map);
635	if (sc->queues_dmat != NULL)
636		bus_dma_tag_destroy(sc->queues_dmat);
637
638	if (sc->chain_busaddr != 0)
639		bus_dmamap_unload(sc->chain_dmat, sc->chain_map);
640	if (sc->chain_frames != NULL)
641		bus_dmamem_free(sc->chain_dmat, sc->chain_frames,
642		    sc->chain_map);
643	if (sc->chain_dmat != NULL)
644		bus_dma_tag_destroy(sc->chain_dmat);
645
646	if (sc->sense_busaddr != 0)
647		bus_dmamap_unload(sc->sense_dmat, sc->sense_map);
648	if (sc->sense_frames != NULL)
649		bus_dmamem_free(sc->sense_dmat, sc->sense_frames,
650		    sc->sense_map);
651	if (sc->sense_dmat != NULL)
652		bus_dma_tag_destroy(sc->sense_dmat);
653
654	if (sc->prp_page_busaddr != 0)
655		bus_dmamap_unload(sc->prp_page_dmat, sc->prp_page_map);
656	if (sc->prp_pages != NULL)
657		bus_dmamem_free(sc->prp_page_dmat, sc->prp_pages,
658		    sc->prp_page_map);
659	if (sc->prp_page_dmat != NULL)
660		bus_dma_tag_destroy(sc->prp_page_dmat);
661
662	if (sc->reply_busaddr != 0)
663		bus_dmamap_unload(sc->reply_dmat, sc->reply_map);
664	if (sc->reply_frames != NULL)
665		bus_dmamem_free(sc->reply_dmat, sc->reply_frames,
666		    sc->reply_map);
667	if (sc->reply_dmat != NULL)
668		bus_dma_tag_destroy(sc->reply_dmat);
669
670	if (sc->req_busaddr != 0)
671		bus_dmamap_unload(sc->req_dmat, sc->req_map);
672	if (sc->req_frames != NULL)
673		bus_dmamem_free(sc->req_dmat, sc->req_frames, sc->req_map);
674	if (sc->req_dmat != NULL)
675		bus_dma_tag_destroy(sc->req_dmat);
676
677	if (sc->chains != NULL)
678		free(sc->chains, M_MPR);
679	if (sc->prps != NULL)
680		free(sc->prps, M_MPR);
681	if (sc->commands != NULL) {
682		for (i = 1; i < sc->num_reqs; i++) {
683			cm = &sc->commands[i];
684			bus_dmamap_destroy(sc->buffer_dmat, cm->cm_dmamap);
685		}
686		free(sc->commands, M_MPR);
687	}
688	if (sc->buffer_dmat != NULL)
689		bus_dma_tag_destroy(sc->buffer_dmat);
690}
691
692/*
693 * The terms diag reset and hard reset are used interchangeably in the MPI
694 * docs to mean resetting the controller chip.  In this code diag reset
695 * cleans everything up, and the hard reset function just sends the reset
696 * sequence to the chip.  This should probably be refactored so that every
697 * subsystem gets a reset notification of some sort, and can clean up
698 * appropriately.
699 */
700int
701mpr_reinit(struct mpr_softc *sc)
702{
703	int error;
704	struct mprsas_softc *sassc;
705
706	sassc = sc->sassc;
707
708	MPR_FUNCTRACE(sc);
709
710	mtx_assert(&sc->mpr_mtx, MA_OWNED);
711
712	if (sc->mpr_flags & MPR_FLAGS_DIAGRESET) {
713		mpr_dprint(sc, MPR_INIT, "%s reset already in progress\n",
714		    __func__);
715		return 0;
716	}
717
	mpr_dprint(sc, MPR_INFO, "Reinitializing controller\n");
719	/* make sure the completion callbacks can recognize they're getting
720	 * a NULL cm_reply due to a reset.
721	 */
722	sc->mpr_flags |= MPR_FLAGS_DIAGRESET;
723
724	/*
725	 * Mask interrupts here.
726	 */
727	mpr_dprint(sc, MPR_INIT, "%s mask interrupts\n", __func__);
728	mpr_mask_intr(sc);
729
730	error = mpr_diag_reset(sc, CAN_SLEEP);
731	if (error != 0) {
732		panic("%s hard reset failed with error %d\n", __func__, error);
733	}
734
735	/* Restore the PCI state, including the MSI-X registers */
736	mpr_pci_restore(sc);
737
738	/* Give the I/O subsystem special priority to get itself prepared */
739	mprsas_handle_reinit(sc);
740
741	/*
742	 * Get IOC Facts and allocate all structures based on this information.
743	 * The attach function will also call mpr_iocfacts_allocate at startup.
744	 * If relevant values have changed in IOC Facts, this function will free
745	 * all of the memory based on IOC Facts and reallocate that memory.
746	 */
747	if ((error = mpr_iocfacts_allocate(sc, FALSE)) != 0) {
748		panic("%s IOC Facts based allocation failed with error %d\n",
749		    __func__, error);
750	}
751
752	/*
753	 * Mapping structures will be re-allocated after getting IOC Page8, so
754	 * free these structures here.
755	 */
756	mpr_mapping_exit(sc);
757
758	/*
	 * The only static page currently read is IOC Page8.  Others can be
	 * added in the future.  It's possible that the values in IOC Page8 have
761	 * changed after a Diag Reset due to user modification, so always read
762	 * these.  Interrupts are masked, so unmask them before getting config
763	 * pages.
764	 */
765	mpr_unmask_intr(sc);
766	sc->mpr_flags &= ~MPR_FLAGS_DIAGRESET;
767	mpr_base_static_config_pages(sc);
768
769	/*
	 * Some mapping info is based on IOC Page8 data, so re-initialize the
771	 * mapping tables.
772	 */
773	mpr_mapping_initialize(sc);
774
775	/*
776	 * Restart will reload the event masks clobbered by the reset, and
777	 * then enable the port.
778	 */
779	mpr_reregister_events(sc);
780
781	/* the end of discovery will release the simq, so we're done. */
782	mpr_dprint(sc, MPR_INFO, "%s finished sc %p post %u free %u\n",
783	    __func__, sc, sc->replypostindex, sc->replyfreeindex);
784	mprsas_release_simq_reinit(sassc);
785
786	return 0;
787}
788
/*
 * Wait for the chip to ACK a word that we've put into its FIFO.
 * Wait for up to <timeout> seconds.  Each loop iteration busy-waits for
 * 500 microseconds, so the total wait is
 * [ 0.5 * (2000 * <timeout>) ] milliseconds.
 */
794static int
795mpr_wait_db_ack(struct mpr_softc *sc, int timeout, int sleep_flag)
796{
797	u32 cntdn, count;
798	u32 int_status;
799	u32 doorbell;
800
801	count = 0;
802	cntdn = (sleep_flag == CAN_SLEEP) ? 1000*timeout : 2000*timeout;
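	/*
	 * Illustrative arithmetic (not from the original source): with
	 * timeout = 5 and NO_SLEEP, cntdn = 2000 * 5 = 10000 iterations of a
	 * 500us DELAY, or about 5 seconds of polling.  With CAN_SLEEP,
	 * 1000 * 5 = 5000 iterations of a ~1ms sleep gives a similar total.
	 */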
803	do {
804		int_status = mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
805		if (!(int_status & MPI2_HIS_SYS2IOC_DB_STATUS)) {
806			mpr_dprint(sc, MPR_INIT, "%s: successful count(%d), "
807			    "timeout(%d)\n", __func__, count, timeout);
808			return 0;
809		} else if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) {
810			doorbell = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
811			if ((doorbell & MPI2_IOC_STATE_MASK) ==
812			    MPI2_IOC_STATE_FAULT) {
813				mpr_dprint(sc, MPR_FAULT,
814				    "fault_state(0x%04x)!\n", doorbell);
815				return (EFAULT);
816			}
817		} else if (int_status == 0xFFFFFFFF)
818			goto out;
819
		/*
		 * If it can sleep, sleep for 1 millisecond, else busy-loop
		 * for 0.5 milliseconds.
		 */
824		if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
825			msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0, "mprdba",
826			    hz/1000);
827		else if (sleep_flag == CAN_SLEEP)
828			pause("mprdba", hz/1000);
829		else
830			DELAY(500);
831		count++;
832	} while (--cntdn);
833
834out:
835	mpr_dprint(sc, MPR_FAULT, "%s: failed due to timeout count(%d), "
836		"int_status(%x)!\n", __func__, count, int_status);
837	return (ETIMEDOUT);
838}
839
840/* Wait for the chip to signal that the next word in its FIFO can be fetched */
841static int
842mpr_wait_db_int(struct mpr_softc *sc)
843{
844	int retry;
845
846	for (retry = 0; retry < MPR_DB_MAX_WAIT; retry++) {
847		if ((mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET) &
848		    MPI2_HIS_IOC2SYS_DB_STATUS) != 0)
849			return (0);
850		DELAY(2000);
851	}
852	return (ETIMEDOUT);
853}
854
855/* Step through the synchronous command state machine, i.e. "Doorbell mode" */
856static int
857mpr_request_sync(struct mpr_softc *sc, void *req, MPI2_DEFAULT_REPLY *reply,
858    int req_sz, int reply_sz, int timeout)
859{
860	uint32_t *data32;
861	uint16_t *data16;
862	int i, count, ioc_sz, residual;
863	int sleep_flags = CAN_SLEEP;
864
865#if __FreeBSD_version >= 1000029
866	if (curthread->td_no_sleeping)
867#else //__FreeBSD_version < 1000029
868	if (curthread->td_pflags & TDP_NOSLEEPING)
869#endif //__FreeBSD_version >= 1000029
870		sleep_flags = NO_SLEEP;
871
872	/* Step 1 */
873	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
874
875	/* Step 2 */
876	if (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
877		return (EBUSY);
878
879	/* Step 3
880	 * Announce that a message is coming through the doorbell.  Messages
	 * are pushed as 32-bit words, so round up if needed.
882	 */
883	count = (req_sz + 3) / 4;
884	mpr_regwrite(sc, MPI2_DOORBELL_OFFSET,
885	    (MPI2_FUNCTION_HANDSHAKE << MPI2_DOORBELL_FUNCTION_SHIFT) |
886	    (count << MPI2_DOORBELL_ADD_DWORDS_SHIFT));
887
888	/* Step 4 */
889	if (mpr_wait_db_int(sc) ||
890	    (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) == 0) {
891		mpr_dprint(sc, MPR_FAULT, "Doorbell failed to activate\n");
892		return (ENXIO);
893	}
894	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
895	if (mpr_wait_db_ack(sc, 5, sleep_flags) != 0) {
896		mpr_dprint(sc, MPR_FAULT, "Doorbell handshake failed\n");
897		return (ENXIO);
898	}
899
900	/* Step 5 */
	/* Clock out the message data synchronously in 32-bit dwords */
902	data32 = (uint32_t *)req;
903	for (i = 0; i < count; i++) {
904		mpr_regwrite(sc, MPI2_DOORBELL_OFFSET, htole32(data32[i]));
905		if (mpr_wait_db_ack(sc, 5, sleep_flags) != 0) {
906			mpr_dprint(sc, MPR_FAULT,
907			    "Timeout while writing doorbell\n");
908			return (ENXIO);
909		}
910	}
911
912	/* Step 6 */
913	/* Clock in the reply in 16-bit words.  The total length of the
	 * message is always in the 4th byte, so clock in the first 2 words
915	 * manually, then loop the rest.
916	 */
917	data16 = (uint16_t *)reply;
918	if (mpr_wait_db_int(sc) != 0) {
919		mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell 0\n");
920		return (ENXIO);
921	}
922	data16[0] =
923	    mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK;
924	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
925	if (mpr_wait_db_int(sc) != 0) {
926		mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell 1\n");
927		return (ENXIO);
928	}
929	data16[1] =
930	    mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_DATA_MASK;
931	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
932
933	/* Number of 32bit words in the message */
934	ioc_sz = reply->MsgLength;
935
936	/*
937	 * Figure out how many 16bit words to clock in without overrunning.
938	 * The precision loss with dividing reply_sz can safely be
939	 * ignored because the messages can only be multiples of 32bits.
940	 */
941	residual = 0;
942	count = MIN((reply_sz / 4), ioc_sz) * 2;
943	if (count < ioc_sz * 2) {
944		residual = ioc_sz * 2 - count;
945		mpr_dprint(sc, MPR_ERROR, "Driver error, throwing away %d "
946		    "residual message words\n", residual);
947	}
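	/*
	 * Worked example (illustrative only): if the caller passed a 64-byte
	 * reply buffer (reply_sz / 4 = 16 dwords) but the IOC reports
	 * ioc_sz = 20 dwords, then count = 16 * 2 = 32 16-bit words are read
	 * into the buffer and residual = 40 - 32 = 8 words are discarded.
	 */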
948
949	for (i = 2; i < count; i++) {
950		if (mpr_wait_db_int(sc) != 0) {
951			mpr_dprint(sc, MPR_FAULT,
952			    "Timeout reading doorbell %d\n", i);
953			return (ENXIO);
954		}
955		data16[i] = mpr_regread(sc, MPI2_DOORBELL_OFFSET) &
956		    MPI2_DOORBELL_DATA_MASK;
957		mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
958	}
959
960	/*
961	 * Pull out residual words that won't fit into the provided buffer.
962	 * This keeps the chip from hanging due to a driver programming
963	 * error.
964	 */
965	while (residual--) {
966		if (mpr_wait_db_int(sc) != 0) {
967			mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell\n");
968			return (ENXIO);
969		}
970		(void)mpr_regread(sc, MPI2_DOORBELL_OFFSET);
971		mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
972	}
973
974	/* Step 7 */
975	if (mpr_wait_db_int(sc) != 0) {
976		mpr_dprint(sc, MPR_FAULT, "Timeout waiting to exit doorbell\n");
977		return (ENXIO);
978	}
979	if (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
980		mpr_dprint(sc, MPR_FAULT, "Warning, doorbell still active\n");
981	mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
982
983	return (0);
984}
985
986static void
987mpr_enqueue_request(struct mpr_softc *sc, struct mpr_command *cm)
988{
989	request_descriptor rd;
990
991	MPR_FUNCTRACE(sc);
992	mpr_dprint(sc, MPR_TRACE, "SMID %u cm %p ccb %p\n",
993	    cm->cm_desc.Default.SMID, cm, cm->cm_ccb);
994
995	if (sc->mpr_flags & MPR_FLAGS_ATTACH_DONE && !(sc->mpr_flags &
996	    MPR_FLAGS_SHUTDOWN))
997		mtx_assert(&sc->mpr_mtx, MA_OWNED);
998
999	if (++sc->io_cmds_active > sc->io_cmds_highwater)
1000		sc->io_cmds_highwater++;
1001
1002	if (sc->atomic_desc_capable) {
1003		rd.u.low = cm->cm_desc.Words.Low;
1004		mpr_regwrite(sc, MPI26_ATOMIC_REQUEST_DESCRIPTOR_POST_OFFSET,
1005		    rd.u.low);
1006	} else {
1007		rd.u.low = cm->cm_desc.Words.Low;
1008		rd.u.high = cm->cm_desc.Words.High;
1009		rd.word = htole64(rd.word);
1010		mpr_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_LOW_OFFSET,
1011		    rd.u.low);
1012		mpr_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_HIGH_OFFSET,
1013		    rd.u.high);
1014	}
1015}
1016
1017/*
1018 * Just the FACTS, ma'am.
1019 */
1020static int
1021mpr_get_iocfacts(struct mpr_softc *sc, MPI2_IOC_FACTS_REPLY *facts)
1022{
1023	MPI2_DEFAULT_REPLY *reply;
1024	MPI2_IOC_FACTS_REQUEST request;
1025	int error, req_sz, reply_sz;
1026
1027	MPR_FUNCTRACE(sc);
1028
1029	req_sz = sizeof(MPI2_IOC_FACTS_REQUEST);
1030	reply_sz = sizeof(MPI2_IOC_FACTS_REPLY);
1031	reply = (MPI2_DEFAULT_REPLY *)facts;
1032
1033	bzero(&request, req_sz);
1034	request.Function = MPI2_FUNCTION_IOC_FACTS;
1035	error = mpr_request_sync(sc, &request, reply, req_sz, reply_sz, 5);
1036
1037	return (error);
1038}
1039
1040static int
1041mpr_send_iocinit(struct mpr_softc *sc)
1042{
1043	MPI2_IOC_INIT_REQUEST	init;
1044	MPI2_DEFAULT_REPLY	reply;
1045	int req_sz, reply_sz, error;
1046	struct timeval now;
1047	uint64_t time_in_msec;
1048
1049	MPR_FUNCTRACE(sc);
1050
1051	req_sz = sizeof(MPI2_IOC_INIT_REQUEST);
1052	reply_sz = sizeof(MPI2_IOC_INIT_REPLY);
1053	bzero(&init, req_sz);
1054	bzero(&reply, reply_sz);
1055
1056	/*
1057	 * Fill in the init block.  Note that most addresses are
1058	 * deliberately in the lower 32bits of memory.  This is a micro-
	 * optimization for PCI/PCIX, though it's not clear if it helps PCIe.
1060	 */
1061	init.Function = MPI2_FUNCTION_IOC_INIT;
1062	init.WhoInit = MPI2_WHOINIT_HOST_DRIVER;
1063	init.MsgVersion = htole16(MPI2_VERSION);
1064	init.HeaderVersion = htole16(MPI2_HEADER_VERSION);
1065	init.SystemRequestFrameSize = htole16(sc->facts->IOCRequestFrameSize);
1066	init.ReplyDescriptorPostQueueDepth = htole16(sc->pqdepth);
1067	init.ReplyFreeQueueDepth = htole16(sc->fqdepth);
1068	init.SenseBufferAddressHigh = 0;
1069	init.SystemReplyAddressHigh = 0;
1070	init.SystemRequestFrameBaseAddress.High = 0;
1071	init.SystemRequestFrameBaseAddress.Low =
1072	    htole32((uint32_t)sc->req_busaddr);
1073	init.ReplyDescriptorPostQueueAddress.High = 0;
1074	init.ReplyDescriptorPostQueueAddress.Low =
1075	    htole32((uint32_t)sc->post_busaddr);
1076	init.ReplyFreeQueueAddress.High = 0;
1077	init.ReplyFreeQueueAddress.Low = htole32((uint32_t)sc->free_busaddr);
1078	getmicrotime(&now);
1079	time_in_msec = (now.tv_sec * 1000 + now.tv_usec/1000);
1080	init.TimeStamp.High = htole32((time_in_msec >> 32) & 0xFFFFFFFF);
1081	init.TimeStamp.Low = htole32(time_in_msec & 0xFFFFFFFF);
1082	init.HostPageSize = HOST_PAGE_SIZE_4K;
1083
1084	error = mpr_request_sync(sc, &init, &reply, req_sz, reply_sz, 5);
1085	if ((reply.IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
1086		error = ENXIO;
1087
1088	mpr_dprint(sc, MPR_INIT, "IOCInit status= 0x%x\n", reply.IOCStatus);
1089	return (error);
1090}
1091
1092void
1093mpr_memaddr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
1094{
1095	bus_addr_t *addr;
1096
1097	addr = arg;
1098	*addr = segs[0].ds_addr;
1099}
1100
1101static int
1102mpr_alloc_queues(struct mpr_softc *sc)
1103{
1104	bus_addr_t queues_busaddr;
1105	uint8_t *queues;
1106	int qsize, fqsize, pqsize;
1107
1108	/*
1109	 * The reply free queue contains 4 byte entries in multiples of 16 and
1110	 * aligned on a 16 byte boundary. There must always be an unused entry.
1111	 * This queue supplies fresh reply frames for the firmware to use.
1112	 *
1113	 * The reply descriptor post queue contains 8 byte entries in
1114	 * multiples of 16 and aligned on a 16 byte boundary.  This queue
1115	 * contains filled-in reply frames sent from the firmware to the host.
1116	 *
1117	 * These two queues are allocated together for simplicity.
1118	 */
1119	sc->fqdepth = roundup2(sc->num_replies + 1, 16);
1120	sc->pqdepth = roundup2(sc->num_replies + 1, 16);
	fqsize = sc->fqdepth * 4;
1122	pqsize = sc->pqdepth * 8;
1123	qsize = fqsize + pqsize;
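	/*
	 * Sizing sketch with illustrative numbers (not driver defaults): if
	 * num_replies were 1023, both depths round up to 1024, giving
	 * fqsize = 1024 * 4 = 4KB, pqsize = 1024 * 8 = 8KB, and a single
	 * 12KB allocation below.
	 */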
1124
1125        if (bus_dma_tag_create( sc->mpr_parent_dmat,    /* parent */
1126				16, 0,			/* algnmnt, boundary */
1127				BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
1128				BUS_SPACE_MAXADDR,	/* highaddr */
1129				NULL, NULL,		/* filter, filterarg */
1130                                qsize,			/* maxsize */
1131                                1,			/* nsegments */
1132                                qsize,			/* maxsegsize */
1133                                0,			/* flags */
1134                                NULL, NULL,		/* lockfunc, lockarg */
1135                                &sc->queues_dmat)) {
1136		device_printf(sc->mpr_dev, "Cannot allocate queues DMA tag\n");
1137		return (ENOMEM);
1138        }
1139        if (bus_dmamem_alloc(sc->queues_dmat, (void **)&queues, BUS_DMA_NOWAIT,
1140	    &sc->queues_map)) {
1141		device_printf(sc->mpr_dev, "Cannot allocate queues memory\n");
1142		return (ENOMEM);
1143        }
1144        bzero(queues, qsize);
1145        bus_dmamap_load(sc->queues_dmat, sc->queues_map, queues, qsize,
1146	    mpr_memaddr_cb, &queues_busaddr, 0);
1147
1148	sc->free_queue = (uint32_t *)queues;
1149	sc->free_busaddr = queues_busaddr;
1150	sc->post_queue = (MPI2_REPLY_DESCRIPTORS_UNION *)(queues + fqsize);
1151	sc->post_busaddr = queues_busaddr + fqsize;
1152
1153	return (0);
1154}
1155
1156static int
1157mpr_alloc_replies(struct mpr_softc *sc)
1158{
1159	int rsize, num_replies;
1160
1161	/*
1162	 * sc->num_replies should be one less than sc->fqdepth.  We need to
1163	 * allocate space for sc->fqdepth replies, but only sc->num_replies
1164	 * replies can be used at once.
1165	 */
1166	num_replies = max(sc->fqdepth, sc->num_replies);
1167
1168	rsize = sc->facts->ReplyFrameSize * num_replies * 4;
1169        if (bus_dma_tag_create( sc->mpr_parent_dmat,    /* parent */
1170				4, 0,			/* algnmnt, boundary */
1171				BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
1172				BUS_SPACE_MAXADDR,	/* highaddr */
1173				NULL, NULL,		/* filter, filterarg */
1174                                rsize,			/* maxsize */
1175                                1,			/* nsegments */
1176                                rsize,			/* maxsegsize */
1177                                0,			/* flags */
1178                                NULL, NULL,		/* lockfunc, lockarg */
1179                                &sc->reply_dmat)) {
1180		device_printf(sc->mpr_dev, "Cannot allocate replies DMA tag\n");
1181		return (ENOMEM);
1182        }
1183        if (bus_dmamem_alloc(sc->reply_dmat, (void **)&sc->reply_frames,
1184	    BUS_DMA_NOWAIT, &sc->reply_map)) {
1185		device_printf(sc->mpr_dev, "Cannot allocate replies memory\n");
1186		return (ENOMEM);
1187        }
1188        bzero(sc->reply_frames, rsize);
1189        bus_dmamap_load(sc->reply_dmat, sc->reply_map, sc->reply_frames, rsize,
1190	    mpr_memaddr_cb, &sc->reply_busaddr, 0);
1191
1192	return (0);
1193}
1194
1195static int
1196mpr_alloc_requests(struct mpr_softc *sc)
1197{
1198	struct mpr_command *cm;
1199	struct mpr_chain *chain;
1200	int i, rsize, nsegs;
1201
1202	rsize = sc->facts->IOCRequestFrameSize * sc->num_reqs * 4;
1203        if (bus_dma_tag_create( sc->mpr_parent_dmat,    /* parent */
1204				16, 0,			/* algnmnt, boundary */
1205				BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
1206				BUS_SPACE_MAXADDR,	/* highaddr */
1207				NULL, NULL,		/* filter, filterarg */
1208                                rsize,			/* maxsize */
1209                                1,			/* nsegments */
1210                                rsize,			/* maxsegsize */
1211                                0,			/* flags */
1212                                NULL, NULL,		/* lockfunc, lockarg */
1213                                &sc->req_dmat)) {
1214		device_printf(sc->mpr_dev, "Cannot allocate request DMA tag\n");
1215		return (ENOMEM);
1216        }
1217        if (bus_dmamem_alloc(sc->req_dmat, (void **)&sc->req_frames,
1218	    BUS_DMA_NOWAIT, &sc->req_map)) {
1219		device_printf(sc->mpr_dev, "Cannot allocate request memory\n");
1220		return (ENOMEM);
1221        }
1222        bzero(sc->req_frames, rsize);
1223        bus_dmamap_load(sc->req_dmat, sc->req_map, sc->req_frames, rsize,
1224	    mpr_memaddr_cb, &sc->req_busaddr, 0);
1225
	/*
	 * Gen3 and beyond use the IOCMaxChainSegmentSize from IOC Facts to
	 * get the size of a Chain Frame.  Previous versions use the Request
	 * Frame size for the Chain Frame size.  If IOCMaxChainSegmentSize is
	 * 0, use the default value.  IOCMaxChainSegmentSize is the number of
	 * 16-byte elements that can fit in a Chain Frame, which is the size
	 * of an IEEE Simple SGE.
	 */
1234	if (sc->facts->MsgVersion >= MPI2_VERSION_02_05) {
1235		sc->chain_seg_size =
1236		    htole16(sc->facts->IOCMaxChainSegmentSize);
1237		if (sc->chain_seg_size == 0) {
1238			sc->chain_frame_size = MPR_DEFAULT_CHAIN_SEG_SIZE *
1239			    MPR_MAX_CHAIN_ELEMENT_SIZE;
1240		} else {
1241			sc->chain_frame_size = sc->chain_seg_size *
1242			    MPR_MAX_CHAIN_ELEMENT_SIZE;
1243		}
1244	} else {
1245		sc->chain_frame_size = sc->facts->IOCRequestFrameSize * 4;
1246	}
1247	rsize = sc->chain_frame_size * sc->max_chains;
1248        if (bus_dma_tag_create( sc->mpr_parent_dmat,    /* parent */
1249				16, 0,			/* algnmnt, boundary */
1250				BUS_SPACE_MAXADDR,	/* lowaddr */
1251				BUS_SPACE_MAXADDR,	/* highaddr */
1252				NULL, NULL,		/* filter, filterarg */
1253                                rsize,			/* maxsize */
1254                                1,			/* nsegments */
1255                                rsize,			/* maxsegsize */
1256                                0,			/* flags */
1257                                NULL, NULL,		/* lockfunc, lockarg */
1258                                &sc->chain_dmat)) {
1259		device_printf(sc->mpr_dev, "Cannot allocate chain DMA tag\n");
1260		return (ENOMEM);
1261        }
1262        if (bus_dmamem_alloc(sc->chain_dmat, (void **)&sc->chain_frames,
1263	    BUS_DMA_NOWAIT, &sc->chain_map)) {
1264		device_printf(sc->mpr_dev, "Cannot allocate chain memory\n");
1265		return (ENOMEM);
1266        }
1267        bzero(sc->chain_frames, rsize);
1268        bus_dmamap_load(sc->chain_dmat, sc->chain_map, sc->chain_frames, rsize,
1269	    mpr_memaddr_cb, &sc->chain_busaddr, 0);
1270
1271	rsize = MPR_SENSE_LEN * sc->num_reqs;
1272	if (bus_dma_tag_create( sc->mpr_parent_dmat,    /* parent */
1273				1, 0,			/* algnmnt, boundary */
1274				BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
1275				BUS_SPACE_MAXADDR,	/* highaddr */
1276				NULL, NULL,		/* filter, filterarg */
1277                                rsize,			/* maxsize */
1278                                1,			/* nsegments */
1279                                rsize,			/* maxsegsize */
1280                                0,			/* flags */
1281                                NULL, NULL,		/* lockfunc, lockarg */
1282                                &sc->sense_dmat)) {
1283		device_printf(sc->mpr_dev, "Cannot allocate sense DMA tag\n");
1284		return (ENOMEM);
1285        }
1286        if (bus_dmamem_alloc(sc->sense_dmat, (void **)&sc->sense_frames,
1287	    BUS_DMA_NOWAIT, &sc->sense_map)) {
1288		device_printf(sc->mpr_dev, "Cannot allocate sense memory\n");
1289		return (ENOMEM);
1290        }
1291        bzero(sc->sense_frames, rsize);
1292        bus_dmamap_load(sc->sense_dmat, sc->sense_map, sc->sense_frames, rsize,
1293	    mpr_memaddr_cb, &sc->sense_busaddr, 0);
1294
1295	sc->chains = malloc(sizeof(struct mpr_chain) * sc->max_chains, M_MPR,
1296	    M_WAITOK | M_ZERO);
1297	if (!sc->chains) {
1298		device_printf(sc->mpr_dev, "Cannot allocate memory %s %d\n",
1299		    __func__, __LINE__);
1300		return (ENOMEM);
1301	}
1302	for (i = 0; i < sc->max_chains; i++) {
1303		chain = &sc->chains[i];
1304		chain->chain = (MPI2_SGE_IO_UNION *)(sc->chain_frames +
1305		    i * sc->chain_frame_size);
1306		chain->chain_busaddr = sc->chain_busaddr +
1307		    i * sc->chain_frame_size;
1308		mpr_free_chain(sc, chain);
1309		sc->chain_free_lowwater++;
1310	}
1311
1312	/*
1313	 * Allocate NVMe PRP Pages for NVMe SGL support only if the FW supports
1314	 * these devices.
1315	 */
1316	if ((sc->facts->MsgVersion >= MPI2_VERSION_02_06) &&
1317	    (sc->facts->ProtocolFlags & MPI2_IOCFACTS_PROTOCOL_NVME_DEVICES)) {
1318		if (mpr_alloc_nvme_prp_pages(sc) == ENOMEM)
1319			return (ENOMEM);
1320	}
1321
1322	/* XXX Need to pick a more precise value */
1323	nsegs = (MAXPHYS / PAGE_SIZE) + 1;
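	/*
	 * Illustrative only: with a typical 128KB MAXPHYS and 4KB pages this
	 * works out to 33 segments.
	 */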
1324        if (bus_dma_tag_create( sc->mpr_parent_dmat,    /* parent */
1325				1, 0,			/* algnmnt, boundary */
1326				BUS_SPACE_MAXADDR,	/* lowaddr */
1327				BUS_SPACE_MAXADDR,	/* highaddr */
1328				NULL, NULL,		/* filter, filterarg */
1329                                BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
1330                                nsegs,			/* nsegments */
1331                                BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
1332                                BUS_DMA_ALLOCNOW,	/* flags */
1333                                busdma_lock_mutex,	/* lockfunc */
1334				&sc->mpr_mtx,		/* lockarg */
1335                                &sc->buffer_dmat)) {
1336		device_printf(sc->mpr_dev, "Cannot allocate buffer DMA tag\n");
1337		return (ENOMEM);
1338        }
1339
1340	/*
1341	 * SMID 0 cannot be used as a free command per the firmware spec.
1342	 * Just drop that command instead of risking accounting bugs.
1343	 */
1344	sc->commands = malloc(sizeof(struct mpr_command) * sc->num_reqs,
1345	    M_MPR, M_WAITOK | M_ZERO);
1346	if (!sc->commands) {
1347		device_printf(sc->mpr_dev, "Cannot allocate memory %s %d\n",
1348		    __func__, __LINE__);
1349		return (ENOMEM);
1350	}
1351	for (i = 1; i < sc->num_reqs; i++) {
1352		cm = &sc->commands[i];
1353		cm->cm_req = sc->req_frames +
1354		    i * sc->facts->IOCRequestFrameSize * 4;
1355		cm->cm_req_busaddr = sc->req_busaddr +
1356		    i * sc->facts->IOCRequestFrameSize * 4;
1357		cm->cm_sense = &sc->sense_frames[i];
1358		cm->cm_sense_busaddr = sc->sense_busaddr + i * MPR_SENSE_LEN;
1359		cm->cm_desc.Default.SMID = i;
1360		cm->cm_sc = sc;
1361		TAILQ_INIT(&cm->cm_chain_list);
1362		TAILQ_INIT(&cm->cm_prp_page_list);
1363		callout_init_mtx(&cm->cm_callout, &sc->mpr_mtx, 0);
1364
1365		/* XXX Is a failure here a critical problem? */
1366		if (bus_dmamap_create(sc->buffer_dmat, 0, &cm->cm_dmamap)
1367		    == 0) {
1368			if (i <= sc->num_prireqs)
1369				mpr_free_high_priority_command(sc, cm);
1370			else
1371				mpr_free_command(sc, cm);
1372		} else {
1373			panic("failed to allocate command %d\n", i);
1374			sc->num_reqs = i;
1375			break;
1376		}
1377	}
1378
1379	return (0);
1380}
1381
1382/*
1383 * Allocate contiguous buffers for PCIe NVMe devices for building native PRPs,
1384 * which are scatter/gather lists for NVMe devices.
1385 *
1386 * This buffer must be contiguous due to the nature of how NVMe PRPs are built
1387 * and translated by FW.
1388 *
1389 * returns ENOMEM if memory could not be allocated, otherwise returns 0.
1390 */
1391static int
1392mpr_alloc_nvme_prp_pages(struct mpr_softc *sc)
1393{
1394	int PRPs_per_page, PRPs_required, pages_required;
1395	int rsize, i;
1396	struct mpr_prp_page *prp_page;
1397
1398	/*
1399	 * Assuming a MAX_IO_SIZE of 1MB and a PAGE_SIZE of 4k, the max number
1400	 * of PRPs (NVMe's Scatter/Gather Element) needed per I/O is:
1401	 * MAX_IO_SIZE / PAGE_SIZE = 256
1402	 *
1403	 * 1 PRP entry in main frame for PRP list pointer still leaves 255 PRPs
1404	 * required for the remainder of the 1MB I/O. 512 PRPs can fit into one
1405	 * page (4096 / 8 = 512), so only one page is required for each I/O.
1406	 *
1407	 * Each of these buffers will need to be contiguous. For simplicity,
1408	 * only one buffer is allocated here, which has all of the space
1409	 * required for the NVMe Queue Depth. If there are problems allocating
1410	 * this one buffer, this function will need to change to allocate
1411	 * individual, contiguous NVME_QDEPTH buffers.
1412	 *
1413	 * The real calculation will use the real max io size. Above is just an
1414	 * example.
1415	 *
1416	 */
1417	PRPs_required = sc->maxio / PAGE_SIZE;
1418	PRPs_per_page = (PAGE_SIZE / PRP_ENTRY_SIZE) - 1;
1419	pages_required = (PRPs_required / PRPs_per_page) + 1;
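	/*
	 * Worked example (illustrative, assuming the 8-byte PRP entry size
	 * described above): with a 1MB max I/O and 4KB pages,
	 * PRPs_required = 1MB / 4KB = 256, PRPs_per_page =
	 * (4096 / 8) - 1 = 511, and pages_required = (256 / 511) + 1 = 1,
	 * so one PRP page is reserved per I/O slot.
	 */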
1420
1421	sc->prp_buffer_size = PAGE_SIZE * pages_required;
1422	rsize = sc->prp_buffer_size * NVME_QDEPTH;
1423	if (bus_dma_tag_create( sc->mpr_parent_dmat,	/* parent */
1424				4, 0,			/* algnmnt, boundary */
1425				BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
1426				BUS_SPACE_MAXADDR,	/* highaddr */
1427				NULL, NULL,		/* filter, filterarg */
1428				rsize,			/* maxsize */
1429				1,			/* nsegments */
1430				rsize,			/* maxsegsize */
1431				0,			/* flags */
1432				NULL, NULL,		/* lockfunc, lockarg */
1433				&sc->prp_page_dmat)) {
1434		device_printf(sc->mpr_dev, "Cannot allocate NVMe PRP DMA "
1435		    "tag\n");
1436		return (ENOMEM);
1437	}
1438	if (bus_dmamem_alloc(sc->prp_page_dmat, (void **)&sc->prp_pages,
1439	    BUS_DMA_NOWAIT, &sc->prp_page_map)) {
1440		device_printf(sc->mpr_dev, "Cannot allocate NVMe PRP memory\n");
1441		return (ENOMEM);
1442	}
1443	bzero(sc->prp_pages, rsize);
1444	bus_dmamap_load(sc->prp_page_dmat, sc->prp_page_map, sc->prp_pages,
1445	    rsize, mpr_memaddr_cb, &sc->prp_page_busaddr, 0);
1446
1447	sc->prps = malloc(sizeof(struct mpr_prp_page) * NVME_QDEPTH, M_MPR,
1448	    M_WAITOK | M_ZERO);
1449	for (i = 0; i < NVME_QDEPTH; i++) {
1450		prp_page = &sc->prps[i];
1451		prp_page->prp_page = (uint64_t *)(sc->prp_pages +
1452		    i * sc->prp_buffer_size);
1453		prp_page->prp_page_busaddr = (uint64_t)(sc->prp_page_busaddr +
1454		    i * sc->prp_buffer_size);
1455		mpr_free_prp_page(sc, prp_page);
1456		sc->prp_pages_free_lowwater++;
1457	}
1458
1459	return (0);
1460}
1461
1462static int
1463mpr_init_queues(struct mpr_softc *sc)
1464{
1465	int i;
1466
1467	memset((uint8_t *)sc->post_queue, 0xff, sc->pqdepth * 8);
1468
1469	/*
1470	 * According to the spec, we need to use one less reply than we
1471	 * have space for on the queue.  So sc->num_replies (the number we
1472	 * use) should be less than sc->fqdepth (allocated size).
1473	 */
1474	if (sc->num_replies >= sc->fqdepth)
1475		return (EINVAL);
1476
1477	/*
1478	 * Initialize all of the free queue entries.
1479	 */
1480	for (i = 0; i < sc->fqdepth; i++) {
1481		sc->free_queue[i] = sc->reply_busaddr +
1482		    (i * sc->facts->ReplyFrameSize * 4);
1483	}
1484	sc->replyfreeindex = sc->num_replies;
1485
1486	return (0);
1487}
1488
1489/* Get the driver parameter tunables.  Lowest priority are the driver defaults.
1490 * Next are the global settings, if they exist.  Highest are the per-unit
1491 * settings, if they exist.
1492 */
1493void
1494mpr_get_tunables(struct mpr_softc *sc)
1495{
1496	char tmpstr[80];
1497
1498	/* XXX default to some debugging for now */
1499	sc->mpr_debug = MPR_INFO | MPR_FAULT;
1500	sc->disable_msix = 0;
1501	sc->disable_msi = 0;
1502	sc->max_chains = MPR_CHAIN_FRAMES;
1503	sc->max_io_pages = MPR_MAXIO_PAGES;
1504	sc->enable_ssu = MPR_SSU_ENABLE_SSD_DISABLE_HDD;
1505	sc->spinup_wait_time = DEFAULT_SPINUP_WAIT;
1506	sc->use_phynum = 1;
1507
1508	/*
1509	 * Grab the global variables.
1510	 */
1511	TUNABLE_INT_FETCH("hw.mpr.debug_level", &sc->mpr_debug);
1512	TUNABLE_INT_FETCH("hw.mpr.disable_msix", &sc->disable_msix);
1513	TUNABLE_INT_FETCH("hw.mpr.disable_msi", &sc->disable_msi);
1514	TUNABLE_INT_FETCH("hw.mpr.max_chains", &sc->max_chains);
1515	TUNABLE_INT_FETCH("hw.mpr.max_io_pages", &sc->max_io_pages);
1516	TUNABLE_INT_FETCH("hw.mpr.enable_ssu", &sc->enable_ssu);
1517	TUNABLE_INT_FETCH("hw.mpr.spinup_wait_time", &sc->spinup_wait_time);
1518	TUNABLE_INT_FETCH("hw.mpr.use_phy_num", &sc->use_phynum);
1519
1520	/* Grab the unit-instance variables */
1521	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.debug_level",
1522	    device_get_unit(sc->mpr_dev));
1523	TUNABLE_INT_FETCH(tmpstr, &sc->mpr_debug);
1524
1525	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.disable_msix",
1526	    device_get_unit(sc->mpr_dev));
1527	TUNABLE_INT_FETCH(tmpstr, &sc->disable_msix);
1528
1529	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.disable_msi",
1530	    device_get_unit(sc->mpr_dev));
1531	TUNABLE_INT_FETCH(tmpstr, &sc->disable_msi);
1532
1533	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_chains",
1534	    device_get_unit(sc->mpr_dev));
1535	TUNABLE_INT_FETCH(tmpstr, &sc->max_chains);
1536
1537	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_io_pages",
1538	    device_get_unit(sc->mpr_dev));
1539	TUNABLE_INT_FETCH(tmpstr, &sc->max_io_pages);
1540
1541	bzero(sc->exclude_ids, sizeof(sc->exclude_ids));
1542	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.exclude_ids",
1543	    device_get_unit(sc->mpr_dev));
1544	TUNABLE_STR_FETCH(tmpstr, sc->exclude_ids, sizeof(sc->exclude_ids));
1545
1546	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.enable_ssu",
1547	    device_get_unit(sc->mpr_dev));
1548	TUNABLE_INT_FETCH(tmpstr, &sc->enable_ssu);
1549
1550	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.spinup_wait_time",
1551	    device_get_unit(sc->mpr_dev));
1552	TUNABLE_INT_FETCH(tmpstr, &sc->spinup_wait_time);
1553
1554	snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.use_phy_num",
1555	    device_get_unit(sc->mpr_dev));
1556	TUNABLE_INT_FETCH(tmpstr, &sc->use_phynum);
1557}
1558
1559static void
1560mpr_setup_sysctl(struct mpr_softc *sc)
1561{
1562	struct sysctl_ctx_list	*sysctl_ctx = NULL;
1563	struct sysctl_oid	*sysctl_tree = NULL;
1564	char tmpstr[80], tmpstr2[80];
1565
1566	/*
1567	 * Setup the sysctl variable so the user can change the debug level
1568	 * on the fly.
1569	 */
1570	snprintf(tmpstr, sizeof(tmpstr), "MPR controller %d",
1571	    device_get_unit(sc->mpr_dev));
1572	snprintf(tmpstr2, sizeof(tmpstr2), "%d", device_get_unit(sc->mpr_dev));
1573
1574	sysctl_ctx = device_get_sysctl_ctx(sc->mpr_dev);
1575	if (sysctl_ctx != NULL)
1576		sysctl_tree = device_get_sysctl_tree(sc->mpr_dev);
1577
1578	if (sysctl_tree == NULL) {
1579		sysctl_ctx_init(&sc->sysctl_ctx);
1580		sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
1581		    SYSCTL_STATIC_CHILDREN(_hw_mpr), OID_AUTO, tmpstr2,
1582		    CTLFLAG_RD, 0, tmpstr);
1583		if (sc->sysctl_tree == NULL)
1584			return;
1585		sysctl_ctx = &sc->sysctl_ctx;
1586		sysctl_tree = sc->sysctl_tree;
1587	}
1588
1589	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1590	    OID_AUTO, "debug_level", CTLFLAG_RW, &sc->mpr_debug, 0,
1591	    "mpr debug level");
1592
1593	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1594	    OID_AUTO, "disable_msix", CTLFLAG_RD, &sc->disable_msix, 0,
1595	    "Disable the use of MSI-X interrupts");
1596
1597	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1598	    OID_AUTO, "disable_msi", CTLFLAG_RD, &sc->disable_msi, 0,
1599	    "Disable the use of MSI interrupts");
1600
1601	SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1602	    OID_AUTO, "firmware_version", CTLFLAG_RW, sc->fw_version,
1603	    strlen(sc->fw_version), "firmware version");
1604
1605	SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1606	    OID_AUTO, "driver_version", CTLFLAG_RW, MPR_DRIVER_VERSION,
1607	    strlen(MPR_DRIVER_VERSION), "driver version");
1608
1609	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1610	    OID_AUTO, "io_cmds_active", CTLFLAG_RD,
1611	    &sc->io_cmds_active, 0, "number of currently active commands");
1612
1613	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1614	    OID_AUTO, "io_cmds_highwater", CTLFLAG_RD,
1615	    &sc->io_cmds_highwater, 0, "maximum active commands seen");
1616
1617	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1618	    OID_AUTO, "chain_free", CTLFLAG_RD,
1619	    &sc->chain_free, 0, "number of free chain elements");
1620
1621	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1622	    OID_AUTO, "chain_free_lowwater", CTLFLAG_RD,
1623	    &sc->chain_free_lowwater, 0,"lowest number of free chain elements");
1624
1625	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1626	    OID_AUTO, "max_chains", CTLFLAG_RD,
1627	    &sc->max_chains, 0,"maximum chain frames that will be allocated");
1628
1629	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1630	    OID_AUTO, "max_io_pages", CTLFLAG_RD,
1631	    &sc->max_io_pages, 0,"maximum pages to allow per I/O (if <1 use "
1632	    "IOCFacts)");
1633
1634	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1635	    OID_AUTO, "enable_ssu", CTLFLAG_RW, &sc->enable_ssu, 0,
1636	    "enable SSU to SATA SSD/HDD at shutdown");
1637
1638	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1639	    OID_AUTO, "chain_alloc_fail", CTLFLAG_RD,
1640	    &sc->chain_alloc_fail, "chain allocation failures");
1641
1642	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1643	    OID_AUTO, "spinup_wait_time", CTLFLAG_RD,
1644	    &sc->spinup_wait_time, DEFAULT_SPINUP_WAIT, "seconds to wait for "
1645	    "spinup after SATA ID error");
1646
1647	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1648	    OID_AUTO, "use_phy_num", CTLFLAG_RD, &sc->use_phynum, 0,
1649	    "Use the phy number for enumeration");
1650
1651	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1652	    OID_AUTO, "prp_pages_free", CTLFLAG_RD,
1653	    &sc->prp_pages_free, 0, "number of free PRP pages");
1654
1655	SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1656	    OID_AUTO, "prp_pages_free_lowwater", CTLFLAG_RD,
1657	    &sc->prp_pages_free_lowwater, 0,"lowest number of free PRP pages");
1658
1659	SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
1660	    OID_AUTO, "prp_page_alloc_fail", CTLFLAG_RD,
1661	    &sc->prp_page_alloc_fail, "PRP page allocation failures");
1662}
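
/*
 * Example usage (illustrative): the nodes added above with CTLFLAG_RW, such
 * as debug_level and enable_ssu, can be changed at runtime with sysctl(8),
 * e.g.:
 *
 *	sysctl dev.mpr.0.debug_level=0x3
 */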
1663
1664int
1665mpr_attach(struct mpr_softc *sc)
1666{
1667	int error;
1668
1669	MPR_FUNCTRACE(sc);
1670
1671	mtx_init(&sc->mpr_mtx, "MPR lock", NULL, MTX_DEF);
1672	callout_init_mtx(&sc->periodic, &sc->mpr_mtx, 0);
1673	callout_init_mtx(&sc->device_check_callout, &sc->mpr_mtx, 0);
1674	TAILQ_INIT(&sc->event_list);
1675	timevalclear(&sc->lastfail);
1676
1677	if ((error = mpr_transition_ready(sc)) != 0) {
1678		mpr_printf(sc, "%s failed to transition ready\n", __func__);
1679		return (error);
1680	}
1681
1682	sc->facts = malloc(sizeof(MPI2_IOC_FACTS_REPLY), M_MPR,
1683	    M_ZERO|M_NOWAIT);
1684	if (!sc->facts) {
1685		device_printf(sc->mpr_dev, "Cannot allocate memory %s %d\n",
1686		    __func__, __LINE__);
1687		return (ENOMEM);
1688	}
1689
1690	/*
1691	 * Get IOC Facts and allocate all structures based on this information.
1692	 * A Diag Reset will also call mpr_iocfacts_allocate and re-read the IOC
1693	 * Facts. If relevant values have changed in IOC Facts, this function
1694	 * will free all of the memory based on IOC Facts and reallocate that
1695	 * memory.  If this fails, any allocated memory should already be freed.
1696	 */
1697	if ((error = mpr_iocfacts_allocate(sc, TRUE)) != 0) {
1698		mpr_dprint(sc, MPR_FAULT, "%s IOC Facts based allocation "
1699		    "failed with error %d\n", __func__, error);
1700		return (error);
1701	}
1702
1703	/* Start the periodic watchdog check on the IOC Doorbell */
1704	mpr_periodic(sc);
1705
1706	/*
1707	 * The portenable will kick off discovery events that will drive the
1708	 * rest of the initialization process.  The CAM/SAS module will
1709	 * hold up the boot sequence until discovery is complete.
1710	 */
1711	sc->mpr_ich.ich_func = mpr_startup;
1712	sc->mpr_ich.ich_arg = sc;
1713	if (config_intrhook_establish(&sc->mpr_ich) != 0) {
1714		mpr_dprint(sc, MPR_ERROR, "Cannot establish MPR config hook\n");
1715		error = EINVAL;
1716	}
1717
1718	/*
1719	 * Allow IR to shutdown gracefully when shutdown occurs.
1720	 */
1721	sc->shutdown_eh = EVENTHANDLER_REGISTER(shutdown_final,
1722	    mprsas_ir_shutdown, sc, SHUTDOWN_PRI_DEFAULT);
1723
1724	if (sc->shutdown_eh == NULL)
1725		mpr_dprint(sc, MPR_ERROR, "shutdown event registration "
1726		    "failed\n");
1727
1728	mpr_setup_sysctl(sc);
1729
1730	sc->mpr_flags |= MPR_FLAGS_ATTACH_DONE;
1731
1732	return (error);
1733}
1734
1735/* Run through any late-start handlers. */
1736static void
1737mpr_startup(void *arg)
1738{
1739	struct mpr_softc *sc;
1740
1741	sc = (struct mpr_softc *)arg;
1742
1743	mpr_lock(sc);
1744	mpr_unmask_intr(sc);
1745
1746	/* initialize device mapping tables */
1747	mpr_base_static_config_pages(sc);
1748	mpr_mapping_initialize(sc);
1749	mprsas_startup(sc);
1750	mpr_unlock(sc);
1751}
1752
/* Periodic watchdog.  It is called with the driver lock already held. */
1754static void
1755mpr_periodic(void *arg)
1756{
1757	struct mpr_softc *sc;
1758	uint32_t db;
1759
1760	sc = (struct mpr_softc *)arg;
1761	if (sc->mpr_flags & MPR_FLAGS_SHUTDOWN)
1762		return;
1763
1764	db = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
1765	if ((db & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
1766		if ((db & MPI2_DOORBELL_FAULT_CODE_MASK) ==
1767		    IFAULT_IOP_OVER_TEMP_THRESHOLD_EXCEEDED) {
1768			panic("TEMPERATURE FAULT: STOPPING.");
1769		}
1770		mpr_dprint(sc, MPR_FAULT, "IOC Fault 0x%08x, Resetting\n", db);
1771		mpr_reinit(sc);
1772	}
1773
1774	callout_reset(&sc->periodic, MPR_PERIODIC_DELAY * hz, mpr_periodic, sc);
1775}
1776
1777static void
1778mpr_log_evt_handler(struct mpr_softc *sc, uintptr_t data,
1779    MPI2_EVENT_NOTIFICATION_REPLY *event)
1780{
1781	MPI2_EVENT_DATA_LOG_ENTRY_ADDED *entry;
1782
1783	MPR_DPRINT_EVENT(sc, generic, event);
1784
1785	switch (event->Event) {
1786	case MPI2_EVENT_LOG_DATA:
1787		mpr_dprint(sc, MPR_EVENT, "MPI2_EVENT_LOG_DATA:\n");
1788		if (sc->mpr_debug & MPR_EVENT)
1789			hexdump(event->EventData, event->EventDataLength, NULL,
1790			    0);
1791		break;
1792	case MPI2_EVENT_LOG_ENTRY_ADDED:
1793		entry = (MPI2_EVENT_DATA_LOG_ENTRY_ADDED *)event->EventData;
1794		mpr_dprint(sc, MPR_EVENT, "MPI2_EVENT_LOG_ENTRY_ADDED event "
1795		    "0x%x Sequence %d:\n", entry->LogEntryQualifier,
1796		     entry->LogSequence);
1797		break;
1798	default:
1799		break;
1800	}
1801	return;
1802}
1803
1804static int
1805mpr_attach_log(struct mpr_softc *sc)
1806{
1807	uint8_t events[16];
1808
1809	bzero(events, 16);
1810	setbit(events, MPI2_EVENT_LOG_DATA);
1811	setbit(events, MPI2_EVENT_LOG_ENTRY_ADDED);
1812
1813	mpr_register_events(sc, events, mpr_log_evt_handler, NULL,
1814	    &sc->mpr_log_eh);
1815
1816	return (0);
1817}
1818
1819static int
1820mpr_detach_log(struct mpr_softc *sc)
1821{
1822
1823	if (sc->mpr_log_eh != NULL)
1824		mpr_deregister_events(sc, sc->mpr_log_eh);
1825	return (0);
1826}
1827
1828/*
1829 * Free all of the driver resources and detach submodules.  Should be called
1830 * without the lock held.
1831 */
1832int
1833mpr_free(struct mpr_softc *sc)
1834{
1835	int error;
1836
1837	/* Turn off the watchdog */
1838	mpr_lock(sc);
1839	sc->mpr_flags |= MPR_FLAGS_SHUTDOWN;
1840	mpr_unlock(sc);
1841	/* Lock must not be held for this */
1842	callout_drain(&sc->periodic);
1843	callout_drain(&sc->device_check_callout);
1844
1845	if (((error = mpr_detach_log(sc)) != 0) ||
1846	    ((error = mpr_detach_sas(sc)) != 0))
1847		return (error);
1848
1849	mpr_detach_user(sc);
1850
1851	/* Put the IOC back in the READY state. */
1852	mpr_lock(sc);
1853	if ((error = mpr_transition_ready(sc)) != 0) {
1854		mpr_unlock(sc);
1855		return (error);
1856	}
1857	mpr_unlock(sc);
1858
1859	if (sc->facts != NULL)
1860		free(sc->facts, M_MPR);
1861
1862	/*
1863	 * Free all buffers that are based on IOC Facts.  A Diag Reset may need
1864	 * to free these buffers too.
1865	 */
1866	mpr_iocfacts_free(sc);
1867
1868	if (sc->sysctl_tree != NULL)
1869		sysctl_ctx_free(&sc->sysctl_ctx);
1870
1871	/* Deregister the shutdown function */
1872	if (sc->shutdown_eh != NULL)
1873		EVENTHANDLER_DEREGISTER(shutdown_final, sc->shutdown_eh);
1874
1875	mtx_destroy(&sc->mpr_mtx);
1876
1877	return (0);
1878}
1879
1880static __inline void
1881mpr_complete_command(struct mpr_softc *sc, struct mpr_command *cm)
1882{
1883	MPR_FUNCTRACE(sc);
1884
1885	if (cm == NULL) {
1886		mpr_dprint(sc, MPR_ERROR, "Completing NULL command\n");
1887		return;
1888	}
1889
1890	if (cm->cm_flags & MPR_CM_FLAGS_POLLED)
1891		cm->cm_flags |= MPR_CM_FLAGS_COMPLETE;
1892
1893	if (cm->cm_complete != NULL) {
1894		mpr_dprint(sc, MPR_TRACE,
1895		    "%s cm %p calling cm_complete %p data %p reply %p\n",
1896		    __func__, cm, cm->cm_complete, cm->cm_complete_data,
1897		    cm->cm_reply);
1898		cm->cm_complete(sc, cm);
1899	}
1900
1901	if (cm->cm_flags & MPR_CM_FLAGS_WAKEUP) {
1902		mpr_dprint(sc, MPR_TRACE, "waking up %p\n", cm);
1903		wakeup(cm);
1904	}
1905
1906	if (sc->io_cmds_active != 0) {
1907		sc->io_cmds_active--;
1908	} else {
		mpr_dprint(sc, MPR_ERROR, "Warning: io_cmds_active is "
		    "out of sync - resyncing to 0\n");
1911	}
1912}
1913
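/*
 * Decode and log the IOCLogInfo word from a reply.  With the bit-field
 * layout used below (low bits first), the 32-bit value packs a 16-bit
 * subcode, an 8-bit code, a 4-bit originator and a 4-bit bus_type.  As an
 * illustrative example, a log_info of 0x31120101 would decode as bus_type 3
 * (SAS), originator 1 (PL), code 0x12 and sub_code 0x0101.
 */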
1914static void
1915mpr_sas_log_info(struct mpr_softc *sc , u32 log_info)
1916{
1917	union loginfo_type {
1918		u32	loginfo;
1919		struct {
1920			u32	subcode:16;
1921			u32	code:8;
1922			u32	originator:4;
1923			u32	bus_type:4;
1924		} dw;
1925	};
1926	union loginfo_type sas_loginfo;
1927	char *originator_str = NULL;
1928
1929	sas_loginfo.loginfo = log_info;
1930	if (sas_loginfo.dw.bus_type != 3 /*SAS*/)
1931		return;
1932
1933	/* each nexus loss loginfo */
1934	if (log_info == 0x31170000)
1935		return;
1936
1937	/* eat the loginfos associated with task aborts */
	if ((log_info == 0x30050000) || (log_info == 0x31140000) ||
1939	    (log_info == 0x31130000))
1940		return;
1941
1942	switch (sas_loginfo.dw.originator) {
1943	case 0:
1944		originator_str = "IOP";
1945		break;
1946	case 1:
1947		originator_str = "PL";
1948		break;
1949	case 2:
1950		originator_str = "IR";
1951		break;
1952	}
1953
1954	mpr_dprint(sc, MPR_LOG, "log_info(0x%08x): originator(%s), "
1955	    "code(0x%02x), sub_code(0x%04x)\n", log_info, originator_str,
1956	    sas_loginfo.dw.code, sas_loginfo.dw.subcode);
1957}
1958
1959static void
1960mpr_display_reply_info(struct mpr_softc *sc, uint8_t *reply)
1961{
1962	MPI2DefaultReply_t *mpi_reply;
1963	u16 sc_status;
1964
1965	mpi_reply = (MPI2DefaultReply_t*)reply;
1966	sc_status = le16toh(mpi_reply->IOCStatus);
1967	if (sc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE)
1968		mpr_sas_log_info(sc, le32toh(mpi_reply->IOCLogInfo));
1969}
1970
1971void
1972mpr_intr(void *data)
1973{
1974	struct mpr_softc *sc;
1975	uint32_t status;
1976
1977	sc = (struct mpr_softc *)data;
1978	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
1979
1980	/*
1981	 * Check interrupt status register to flush the bus.  This is
1982	 * needed for both INTx interrupts and driver-driven polling
1983	 */
1984	status = mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
1985	if ((status & MPI2_HIS_REPLY_DESCRIPTOR_INTERRUPT) == 0)
1986		return;
1987
1988	mpr_lock(sc);
1989	mpr_intr_locked(data);
1990	mpr_unlock(sc);
1991	return;
1992}
1993
1994/*
1995 * In theory, MSI/MSIX interrupts shouldn't need to read any registers on the
1996 * chip.  Hopefully this theory is correct.
1997 */
1998void
1999mpr_intr_msi(void *data)
2000{
2001	struct mpr_softc *sc;
2002
2003	sc = (struct mpr_softc *)data;
2004	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
2005	mpr_lock(sc);
2006	mpr_intr_locked(data);
2007	mpr_unlock(sc);
2008	return;
2009}
2010
2011/*
2012 * The locking is overly broad and simplistic, but easy to deal with for now.
2013 */
2014void
2015mpr_intr_locked(void *data)
2016{
2017	MPI2_REPLY_DESCRIPTORS_UNION *desc;
2018	struct mpr_softc *sc;
2019	struct mpr_command *cm = NULL;
2020	uint8_t flags;
2021	u_int pq;
2022	MPI2_DIAG_RELEASE_REPLY *rel_rep;
2023	mpr_fw_diagnostic_buffer_t *pBuffer;
2024
2025	sc = (struct mpr_softc *)data;
2026
2027	pq = sc->replypostindex;
2028	mpr_dprint(sc, MPR_TRACE,
2029	    "%s sc %p starting with replypostindex %u\n",
2030	    __func__, sc, sc->replypostindex);
2031
2032	for ( ;; ) {
2033		cm = NULL;
2034		desc = &sc->post_queue[sc->replypostindex];
2035		flags = desc->Default.ReplyFlags &
2036		    MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK;
2037		if ((flags == MPI2_RPY_DESCRIPT_FLAGS_UNUSED) ||
2038		    (le32toh(desc->Words.High) == 0xffffffff))
2039			break;
2040
2041		/* increment the replypostindex now, so that event handlers
2042		 * and cm completion handlers which decide to do a diag
2043		 * reset can zero it without it getting incremented again
2044		 * afterwards, and we break out of this loop on the next
2045		 * iteration since the reply post queue has been cleared to
2046		 * 0xFF and all descriptors look unused (which they are).
2047		 */
2048		if (++sc->replypostindex >= sc->pqdepth)
2049			sc->replypostindex = 0;
2050
2051		switch (flags) {
2052		case MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS:
2053		case MPI25_RPY_DESCRIPT_FLAGS_FAST_PATH_SCSI_IO_SUCCESS:
2054		case MPI26_RPY_DESCRIPT_FLAGS_PCIE_ENCAPSULATED_SUCCESS:
2055			cm = &sc->commands[le16toh(desc->SCSIIOSuccess.SMID)];
2056			cm->cm_reply = NULL;
2057			break;
2058		case MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY:
2059		{
2060			uint32_t baddr;
2061			uint8_t *reply;
2062
2063			/*
2064			 * Re-compose the reply address from the address
2065			 * sent back from the chip.  The ReplyFrameAddress
			 * is the lower 32 bits of the physical address of the
			 * particular reply frame.  Convert that address to
2068			 * host format, and then use that to provide the
2069			 * offset against the virtual address base
2070			 * (sc->reply_frames).
2071			 */
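			/*
			 * For example, with hypothetical values: if
			 * reply_busaddr were 0xfe400000 and the chip returned
			 * a ReplyFrameAddress of 0xfe400080, the reply would
			 * be at sc->reply_frames + 0x80.
			 */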
2072			baddr = le32toh(desc->AddressReply.ReplyFrameAddress);
2073			reply = sc->reply_frames +
2074				(baddr - ((uint32_t)sc->reply_busaddr));
2075			/*
2076			 * Make sure the reply we got back is in a valid
2077			 * range.  If not, go ahead and panic here, since
			 * we'll probably panic as soon as we dereference the
2079			 * reply pointer anyway.
2080			 */
2081			if ((reply < sc->reply_frames)
2082			 || (reply > (sc->reply_frames +
2083			     (sc->fqdepth * sc->facts->ReplyFrameSize * 4)))) {
2084				printf("%s: WARNING: reply %p out of range!\n",
2085				       __func__, reply);
2086				printf("%s: reply_frames %p, fqdepth %d, "
2087				       "frame size %d\n", __func__,
2088				       sc->reply_frames, sc->fqdepth,
2089				       sc->facts->ReplyFrameSize * 4);
2090				printf("%s: baddr %#x,\n", __func__, baddr);
2091				/* LSI-TODO. See Linux Code for Graceful exit */
2092				panic("Reply address out of range");
2093			}
2094			if (le16toh(desc->AddressReply.SMID) == 0) {
2095				if (((MPI2_DEFAULT_REPLY *)reply)->Function ==
2096				    MPI2_FUNCTION_DIAG_BUFFER_POST) {
2097					/*
2098					 * If SMID is 0 for Diag Buffer Post,
2099					 * this implies that the reply is due to
2100					 * a release function with a status that
2101					 * the buffer has been released.  Set
2102					 * the buffer flags accordingly.
2103					 */
2104					rel_rep =
2105					    (MPI2_DIAG_RELEASE_REPLY *)reply;
2106					if ((le16toh(rel_rep->IOCStatus) &
2107					    MPI2_IOCSTATUS_MASK) ==
2108					    MPI2_IOCSTATUS_DIAGNOSTIC_RELEASED)
2109					{
2110						pBuffer =
2111						    &sc->fw_diag_buffer_list[
2112						    rel_rep->BufferType];
2113						pBuffer->valid_data = TRUE;
2114						pBuffer->owned_by_firmware =
2115						    FALSE;
2116						pBuffer->immediate = FALSE;
2117					}
2118				} else
2119					mpr_dispatch_event(sc, baddr,
2120					    (MPI2_EVENT_NOTIFICATION_REPLY *)
2121					    reply);
2122			} else {
2123				cm = &sc->commands[
2124				    le16toh(desc->AddressReply.SMID)];
2125				cm->cm_reply = reply;
2126				cm->cm_reply_data =
2127				    le32toh(desc->AddressReply.
2128				    ReplyFrameAddress);
2129			}
2130			break;
2131		}
2132		case MPI2_RPY_DESCRIPT_FLAGS_TARGETASSIST_SUCCESS:
2133		case MPI2_RPY_DESCRIPT_FLAGS_TARGET_COMMAND_BUFFER:
2134		case MPI2_RPY_DESCRIPT_FLAGS_RAID_ACCELERATOR_SUCCESS:
2135		default:
2136			/* Unhandled */
2137			mpr_dprint(sc, MPR_ERROR, "Unhandled reply 0x%x\n",
2138			    desc->Default.ReplyFlags);
2139			cm = NULL;
2140			break;
2141		}
2142
2143		if (cm != NULL) {
			/* Print error reply frame. */
2145			if (cm->cm_reply)
2146				mpr_display_reply_info(sc,cm->cm_reply);
2147			mpr_complete_command(sc, cm);
2148		}
2149
2150		desc->Words.Low = 0xffffffff;
2151		desc->Words.High = 0xffffffff;
2152	}
2153
2154	if (pq != sc->replypostindex) {
2155		mpr_dprint(sc, MPR_TRACE,
2156		    "%s sc %p writing postindex %d\n",
2157		    __func__, sc, sc->replypostindex);
2158		mpr_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET,
2159		    sc->replypostindex);
2160	}
2161
2162	return;
2163}
2164
2165static void
2166mpr_dispatch_event(struct mpr_softc *sc, uintptr_t data,
2167    MPI2_EVENT_NOTIFICATION_REPLY *reply)
2168{
2169	struct mpr_event_handle *eh;
2170	int event, handled = 0;
2171
2172	event = le16toh(reply->Event);
2173	TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
2174		if (isset(eh->mask, event)) {
2175			eh->callback(sc, data, reply);
2176			handled++;
2177		}
2178	}
2179
2180	if (handled == 0)
		mpr_dprint(sc, MPR_EVENT, "Unhandled event 0x%x\n", event);
2183
2184	/*
2185	 * This is the only place that the event/reply should be freed.
2186	 * Anything wanting to hold onto the event data should have
2187	 * already copied it into their own storage.
2188	 */
2189	mpr_free_reply(sc, data);
2190}
2191
2192static void
2193mpr_reregister_events_complete(struct mpr_softc *sc, struct mpr_command *cm)
2194{
2195	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
2196
2197	if (cm->cm_reply)
2198		MPR_DPRINT_EVENT(sc, generic,
2199			(MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply);
2200
2201	mpr_free_command(sc, cm);
2202
2203	/* next, send a port enable */
2204	mprsas_startup(sc);
2205}
2206
2207/*
2208 * For both register_events and update_events, the caller supplies a bitmap
2209 * of events that it _wants_.  These functions then turn that into a bitmask
2210 * suitable for the controller.
2211 */
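/*
 * Note that in the EventMasks actually sent to the IOC, a set bit disables
 * delivery of the corresponding event; mpr_update_events() therefore starts
 * from an all-ones mask and clears the bit for every event that at least one
 * registered handler has asked for.
 */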
2212int
2213mpr_register_events(struct mpr_softc *sc, uint8_t *mask,
2214    mpr_evt_callback_t *cb, void *data, struct mpr_event_handle **handle)
2215{
2216	struct mpr_event_handle *eh;
2217	int error = 0;
2218
2219	eh = malloc(sizeof(struct mpr_event_handle), M_MPR, M_WAITOK|M_ZERO);
2220	if (!eh) {
2221		device_printf(sc->mpr_dev, "Cannot allocate memory %s %d\n",
2222		    __func__, __LINE__);
2223		return (ENOMEM);
2224	}
2225	eh->callback = cb;
2226	eh->data = data;
2227	TAILQ_INSERT_TAIL(&sc->event_list, eh, eh_list);
2228	if (mask != NULL)
2229		error = mpr_update_events(sc, eh, mask);
2230	*handle = eh;
2231
2232	return (error);
2233}
2234
2235int
2236mpr_update_events(struct mpr_softc *sc, struct mpr_event_handle *handle,
2237    uint8_t *mask)
2238{
2239	MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
2240	MPI2_EVENT_NOTIFICATION_REPLY *reply = NULL;
2241	struct mpr_command *cm = NULL;
2242	struct mpr_event_handle *eh;
2243	int error, i;
2244
2245	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
2246
2247	if ((mask != NULL) && (handle != NULL))
2248		bcopy(mask, &handle->mask[0], 16);
2249	memset(sc->event_mask, 0xff, 16);
2250
2251	TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
2252		for (i = 0; i < 16; i++)
2253			sc->event_mask[i] &= ~eh->mask[i];
2254	}
2255
2256	if ((cm = mpr_alloc_command(sc)) == NULL)
2257		return (EBUSY);
2258	evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
2259	evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
2260	evtreq->MsgFlags = 0;
2261	evtreq->SASBroadcastPrimitiveMasks = 0;
2262#ifdef MPR_DEBUG_ALL_EVENTS
2263	{
2264		u_char fullmask[16];
2265		memset(fullmask, 0x00, 16);
2266		bcopy(fullmask, (uint8_t *)&evtreq->EventMasks, 16);
2267	}
2268#else
2269		bcopy(sc->event_mask, (uint8_t *)&evtreq->EventMasks, 16);
2270#endif
2271	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
2272	cm->cm_data = NULL;
2273
2274	error = mpr_request_polled(sc, &cm);
2275	if (cm != NULL)
2276		reply = (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply;
2277	if ((reply == NULL) ||
2278	    (reply->IOCStatus & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
2279		error = ENXIO;
2280
2281	if (reply)
2282		MPR_DPRINT_EVENT(sc, generic, reply);
2283
2284	mpr_dprint(sc, MPR_TRACE, "%s finished error %d\n", __func__, error);
2285
2286	if (cm != NULL)
2287		mpr_free_command(sc, cm);
2288	return (error);
2289}
2290
2291static int
2292mpr_reregister_events(struct mpr_softc *sc)
2293{
2294	MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
2295	struct mpr_command *cm;
2296	struct mpr_event_handle *eh;
2297	int error, i;
2298
2299	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
2300
2301	/* first, reregister events */
2302
2303	memset(sc->event_mask, 0xff, 16);
2304
2305	TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
2306		for (i = 0; i < 16; i++)
2307			sc->event_mask[i] &= ~eh->mask[i];
2308	}
2309
2310	if ((cm = mpr_alloc_command(sc)) == NULL)
2311		return (EBUSY);
2312	evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
2313	evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
2314	evtreq->MsgFlags = 0;
2315	evtreq->SASBroadcastPrimitiveMasks = 0;
2316#ifdef MPR_DEBUG_ALL_EVENTS
2317	{
2318		u_char fullmask[16];
2319		memset(fullmask, 0x00, 16);
2320		bcopy(fullmask, (uint8_t *)&evtreq->EventMasks, 16);
2321	}
2322#else
2323		bcopy(sc->event_mask, (uint8_t *)&evtreq->EventMasks, 16);
2324#endif
2325	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
2326	cm->cm_data = NULL;
2327	cm->cm_complete = mpr_reregister_events_complete;
2328
2329	error = mpr_map_command(sc, cm);
2330
2331	mpr_dprint(sc, MPR_TRACE, "%s finished with error %d\n", __func__,
2332	    error);
2333	return (error);
2334}
2335
2336int
2337mpr_deregister_events(struct mpr_softc *sc, struct mpr_event_handle *handle)
2338{
2339
2340	TAILQ_REMOVE(&sc->event_list, handle, eh_list);
2341	free(handle, M_MPR);
2342	return (mpr_update_events(sc, NULL, NULL));
2343}
2344
2345/**
2346* mpr_build_nvme_prp - This function is called for NVMe end devices to build a
2347* native SGL (NVMe PRP). The native SGL is built starting in the first PRP entry
2348* of the NVMe message (PRP1). If the data buffer is small enough to be described
2349* entirely using PRP1, then PRP2 is not used. If needed, PRP2 is used to
2350* describe a larger data buffer. If the data buffer is too large to describe
* using the two PRP entries inside the NVMe message, then PRP1 describes the
2352* first data memory segment, and PRP2 contains a pointer to a PRP list located
2353* elsewhere in memory to describe the remaining data memory segments. The PRP
2354* list will be contiguous.
*
* The native SGL for NVMe devices is a Physical Region Page (PRP). A PRP
* consists of a list of PRP entries to describe a number of noncontiguous
2358* physical memory segments as a single memory buffer, just as a SGL does. Note
2359* however, that this function is only used by the IOCTL call, so the memory
2360* given will be guaranteed to be contiguous. There is no need to translate
2361* non-contiguous SGL into a PRP in this case. All PRPs will describe contiguous
2362* space that is one page size each.
2363*
2364* Each NVMe message contains two PRP entries. The first (PRP1) either contains
2365* a PRP list pointer or a PRP element, depending upon the command. PRP2 contains
2366* the second PRP element if the memory being described fits within 2 PRP
2367* entries, or a PRP list pointer if the PRP spans more than two entries.
2368*
2369* A PRP list pointer contains the address of a PRP list, structured as a linear
2370* array of PRP entries. Each PRP entry in this list describes a segment of
2371* physical memory.
2372*
2373* Each 64-bit PRP entry comprises an address and an offset field. The address
2374* always points to the beginning of a PAGE_SIZE physical memory page, and the
2375* offset describes where within that page the memory segment begins. Only the
* first element in a PRP list may contain a non-zero offset, implying that all
2377* memory segments following the first begin at the start of a PAGE_SIZE page.
2378*
* Each PRP element normally describes a chunk of PAGE_SIZE physical memory,
2380* with exceptions for the first and last elements in the list. If the memory
2381* being described by the list begins at a non-zero offset within the first page,
2382* then the first PRP element will contain a non-zero offset indicating where the
2383* region begins within the page. The last memory segment may end before the end
2384* of the PAGE_SIZE segment, depending upon the overall size of the memory being
2385* described by the PRP list.
2386*
2387* Since PRP entries lack any indication of size, the overall data buffer length
2388* is used to determine where the end of the data memory buffer is located, and
2389* how many PRP entries are required to describe it.
2390*
2391* Returns nothing.
2392*/
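/*
 * Worked example (assuming 4 KiB pages, values purely illustrative): a
 * 10 KiB buffer that starts 0x200 bytes into its first page needs three PRP
 * entries.  PRP1 holds the starting address (page base + 0x200) and covers
 * the first 3584 bytes; because more than one additional page remains, PRP2
 * holds a pointer to a PRP list whose two entries point at the next two
 * page-aligned addresses, covering 4096 and 2560 bytes respectively.
 */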
2393void
2394mpr_build_nvme_prp(struct mpr_softc *sc, struct mpr_command *cm,
2395    Mpi26NVMeEncapsulatedRequest_t *nvme_encap_request, void *data,
2396    uint32_t data_in_sz, uint32_t data_out_sz)
2397{
2398	int			prp_size = PRP_ENTRY_SIZE;
2399	uint64_t		*prp_entry, *prp1_entry, *prp2_entry;
2400	uint64_t		*prp_entry_phys, *prp_page, *prp_page_phys;
2401	uint32_t		offset, entry_len, page_mask_result, page_mask;
2402	bus_addr_t		paddr;
2403	size_t			length;
2404	struct mpr_prp_page	*prp_page_info = NULL;
2405
2406	/*
2407	 * Not all commands require a data transfer. If no data, just return
2408	 * without constructing any PRP.
2409	 */
2410	if (!data_in_sz && !data_out_sz)
2411		return;
2412
2413	/*
2414	 * Set pointers to PRP1 and PRP2, which are in the NVMe command. PRP1 is
2415	 * located at a 24 byte offset from the start of the NVMe command. Then
2416	 * set the current PRP entry pointer to PRP1.
2417	 */
2418	prp1_entry = (uint64_t *)(nvme_encap_request->NVMe_Command +
2419	    NVME_CMD_PRP1_OFFSET);
2420	prp2_entry = (uint64_t *)(nvme_encap_request->NVMe_Command +
2421	    NVME_CMD_PRP2_OFFSET);
2422	prp_entry = prp1_entry;
2423
2424	/*
2425	 * For the PRP entries, use the specially allocated buffer of
2426	 * contiguous memory. PRP Page allocation failures should not happen
2427	 * because there should be enough PRP page buffers to account for the
2428	 * possible NVMe QDepth.
2429	 */
2430	prp_page_info = mpr_alloc_prp_page(sc);
2431	KASSERT(prp_page_info != NULL, ("%s: There are no PRP Pages left to be "
2432	    "used for building a native NVMe SGL.\n", __func__));
2433	prp_page = (uint64_t *)prp_page_info->prp_page;
2434	prp_page_phys = (uint64_t *)(uintptr_t)prp_page_info->prp_page_busaddr;
2435
2436	/*
2437	 * Insert the allocated PRP page into the command's PRP page list. This
2438	 * will be freed when the command is freed.
2439	 */
2440	TAILQ_INSERT_TAIL(&cm->cm_prp_page_list, prp_page_info, prp_page_link);
2441
2442	/*
	 * Check if we are within 1 entry of a page boundary.  We don't want
	 * our first entry to be a PRP List entry.
2445	 */
2446	page_mask = PAGE_SIZE - 1;
2447	page_mask_result = (uintptr_t)((uint8_t *)prp_page + prp_size) &
2448	    page_mask;
2449	if (!page_mask_result)
2450	{
2451		/* Bump up to next page boundary. */
2452		prp_page = (uint64_t *)((uint8_t *)prp_page + prp_size);
2453		prp_page_phys = (uint64_t *)((uint8_t *)prp_page_phys +
2454		    prp_size);
2455	}
2456
2457	/*
2458	 * Set PRP physical pointer, which initially points to the current PRP
2459	 * DMA memory page.
2460	 */
2461	prp_entry_phys = prp_page_phys;
2462
2463	/* Get physical address and length of the data buffer. */
2464	paddr = (bus_addr_t)data;
2465	if (data_in_sz)
2466		length = data_in_sz;
2467	else
2468		length = data_out_sz;
2469
2470	/* Loop while the length is not zero. */
2471	while (length)
2472	{
2473		/*
2474		 * Check if we need to put a list pointer here if we are at page
2475		 * boundary - prp_size (8 bytes).
2476		 */
2477		page_mask_result = (uintptr_t)((uint8_t *)prp_entry_phys +
2478		    prp_size) & page_mask;
2479		if (!page_mask_result)
2480		{
2481			/*
2482			 * This is the last entry in a PRP List, so we need to
2483			 * put a PRP list pointer here. What this does is:
2484			 *   - bump the current memory pointer to the next
2485			 *     address, which will be the next full page.
2486			 *   - set the PRP Entry to point to that page. This is
2487			 *     now the PRP List pointer.
			 *   - bump the PRP Entry pointer to the start of the next
2489			 *     page. Since all of this PRP memory is contiguous,
2490			 *     no need to get a new page - it's just the next
2491			 *     address.
2492			 */
2493			prp_entry_phys++;
2494			*prp_entry =
2495			    htole64((uint64_t)(uintptr_t)prp_entry_phys);
2496			prp_entry++;
2497		}
2498
2499		/* Need to handle if entry will be part of a page. */
2500		offset = (uint32_t)paddr & page_mask;
2501		entry_len = PAGE_SIZE - offset;
2502
2503		if (prp_entry == prp1_entry)
2504		{
2505			/*
2506			 * Must fill in the first PRP pointer (PRP1) before
2507			 * moving on.
2508			 */
2509			*prp1_entry = htole64((uint64_t)paddr);
2510
2511			/*
2512			 * Now point to the second PRP entry within the
2513			 * command (PRP2).
2514			 */
2515			prp_entry = prp2_entry;
2516		}
2517		else if (prp_entry == prp2_entry)
2518		{
2519			/*
2520			 * Should the PRP2 entry be a PRP List pointer or just a
2521			 * regular PRP pointer? If there is more than one more
2522			 * page of data, must use a PRP List pointer.
2523			 */
2524			if (length > PAGE_SIZE)
2525			{
2526				/*
2527				 * PRP2 will contain a PRP List pointer because
2528				 * more PRP's are needed with this command. The
2529				 * list will start at the beginning of the
2530				 * contiguous buffer.
2531				 */
2532				*prp2_entry =
2533				    htole64(
2534				    (uint64_t)(uintptr_t)prp_entry_phys);
2535
2536				/*
2537				 * The next PRP Entry will be the start of the
2538				 * first PRP List.
2539				 */
2540				prp_entry = prp_page;
2541			}
2542			else
2543			{
2544				/*
2545				 * After this, the PRP Entries are complete.
2546				 * This command uses 2 PRP's and no PRP list.
2547				 */
2548				*prp2_entry = htole64((uint64_t)paddr);
2549			}
2550		}
2551		else
2552		{
2553			/*
2554			 * Put entry in list and bump the addresses.
2555			 *
2556			 * After PRP1 and PRP2 are filled in, this will fill in
2557			 * all remaining PRP entries in a PRP List, one per each
2558			 * time through the loop.
2559			 */
2560			*prp_entry = htole64((uint64_t)paddr);
2561			prp_entry++;
2562			prp_entry_phys++;
2563		}
2564
2565		/*
2566		 * Bump the phys address of the command's data buffer by the
2567		 * entry_len.
2568		 */
2569		paddr += entry_len;
2570
2571		/* Decrement length accounting for last partial page. */
2572		if (entry_len > length)
2573			length = 0;
2574		else
2575			length -= entry_len;
2576	}
2577}
2578
2579/*
2580 * mpr_check_pcie_native_sgl - This function is called for PCIe end devices to
2581 * determine if the driver needs to build a native SGL. If so, that native SGL
2582 * is built in the contiguous buffers allocated especially for PCIe SGL
2583 * creation. If the driver will not build a native SGL, return TRUE and a
2584 * normal IEEE SGL will be built. Currently this routine supports NVMe devices
2585 * only.
2586 *
2587 * Returns FALSE (0) if native SGL was built, TRUE (1) if no SGL was built.
2588 */
2589static int
2590mpr_check_pcie_native_sgl(struct mpr_softc *sc, struct mpr_command *cm,
2591    bus_dma_segment_t *segs, int segs_left)
2592{
2593	uint32_t		i, sge_dwords, length, offset, entry_len;
2594	uint32_t		num_entries, buff_len = 0, sges_in_segment;
2595	uint32_t		page_mask, page_mask_result, *curr_buff;
2596	uint32_t		*ptr_sgl, *ptr_first_sgl, first_page_offset;
2597	uint32_t		first_page_data_size, end_residual;
2598	uint64_t		*msg_phys;
2599	bus_addr_t		paddr;
2600	int			build_native_sgl = 0, first_prp_entry;
2601	int			prp_size = PRP_ENTRY_SIZE;
2602	Mpi25IeeeSgeChain64_t	*main_chain_element = NULL;
2603	struct mpr_prp_page	*prp_page_info = NULL;
2604
2605	mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
2606
2607	/*
2608	 * Add up the sizes of each segment length to get the total transfer
2609	 * size, which will be checked against the Maximum Data Transfer Size.
2610	 * If the data transfer length exceeds the MDTS for this device, just
2611	 * return 1 so a normal IEEE SGL will be built. F/W will break the I/O
2612	 * up into multiple I/O's. [nvme_mdts = 0 means unlimited]
2613	 */
2614	for (i = 0; i < segs_left; i++)
2615		buff_len += htole32(segs[i].ds_len);
2616	if ((cm->cm_targ->MDTS > 0) && (buff_len > cm->cm_targ->MDTS))
2617		return 1;
2618
2619	/* Create page_mask (to get offset within page) */
2620	page_mask = PAGE_SIZE - 1;
2621
2622	/*
2623	 * Check if the number of elements exceeds the max number that can be
2624	 * put in the main message frame (H/W can only translate an SGL that
2625	 * is contained entirely in the main message frame).
2626	 */
2627	sges_in_segment = (sc->facts->IOCRequestFrameSize -
2628	    offsetof(Mpi25SCSIIORequest_t, SGL)) / sizeof(MPI25_SGE_IO_UNION);
2629	if (segs_left > sges_in_segment)
2630		build_native_sgl = 1;
2631	else
2632	{
2633		/*
2634		 * NVMe uses one PRP for each physical page (or part of physical
2635		 * page).
2636		 *    if 4 pages or less then IEEE is OK
2637		 *    if > 5 pages then we need to build a native SGL
2638		 *    if > 4 and <= 5 pages, then check the physical address of
2639		 *      the first SG entry, then if this first size in the page
2640		 *      is >= the residual beyond 4 pages then use IEEE,
2641		 *      otherwise use native SGL
2642		 */
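		/*
		 * Worked example (assuming 4 KiB pages, illustrative values):
		 * a 17408-byte buffer that starts 0xe00 bytes into its first
		 * page falls in the "> 4 and <= 5 pages" case.  Here
		 * first_page_data_size = 4096 - 0xe00 = 512 and end_residual
		 * = 17408 % 4096 = 1024; since 512 < 1024 the data spills
		 * past the 5th page, so a native SGL is built.
		 */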
2643		if (buff_len > (PAGE_SIZE * 5))
2644			build_native_sgl = 1;
2645		else if ((buff_len > (PAGE_SIZE * 4)) &&
2646		    (buff_len <= (PAGE_SIZE * 5)) )
2647		{
2648			msg_phys = (uint64_t *)segs[0].ds_addr;
2649			first_page_offset =
2650			    ((uint32_t)(uint64_t)(uintptr_t)msg_phys &
2651			    page_mask);
2652			first_page_data_size = PAGE_SIZE - first_page_offset;
2653			end_residual = buff_len % PAGE_SIZE;
2654
2655			/*
2656			 * If offset into first page pushes the end of the data
2657			 * beyond end of the 5th page, we need the extra PRP
2658			 * list.
2659			 */
2660			if (first_page_data_size < end_residual)
2661				build_native_sgl = 1;
2662
2663			/*
2664			 * Check if first SG entry size is < residual beyond 4
2665			 * pages.
2666			 */
2667			if (htole32(segs[0].ds_len) <
2668			    (buff_len - (PAGE_SIZE * 4)))
2669				build_native_sgl = 1;
2670		}
2671	}
2672
2673	/* check if native SGL is needed */
2674	if (!build_native_sgl)
2675		return 1;
2676
2677	/*
2678	 * Native SGL is needed.
2679	 * Put a chain element in main message frame that points to the first
2680	 * chain buffer.
2681	 *
2682	 * NOTE:  The ChainOffset field must be 0 when using a chain pointer to
2683	 *        a native SGL.
2684	 */
2685
2686	/* Set main message chain element pointer */
2687	main_chain_element = (pMpi25IeeeSgeChain64_t)cm->cm_sge;
2688
2689	/*
2690	 * For NVMe the chain element needs to be the 2nd SGL entry in the main
2691	 * message.
2692	 */
2693	main_chain_element = (Mpi25IeeeSgeChain64_t *)
2694	    ((uint8_t *)main_chain_element + sizeof(MPI25_IEEE_SGE_CHAIN64));
2695
2696	/*
2697	 * For the PRP entries, use the specially allocated buffer of
2698	 * contiguous memory. PRP Page allocation failures should not happen
2699	 * because there should be enough PRP page buffers to account for the
2700	 * possible NVMe QDepth.
2701	 */
2702	prp_page_info = mpr_alloc_prp_page(sc);
2703	KASSERT(prp_page_info != NULL, ("%s: There are no PRP Pages left to be "
2704	    "used for building a native NVMe SGL.\n", __func__));
2705	curr_buff = (uint32_t *)prp_page_info->prp_page;
2706	msg_phys = (uint64_t *)(uintptr_t)prp_page_info->prp_page_busaddr;
2707
2708	/*
2709	 * Insert the allocated PRP page into the command's PRP page list. This
2710	 * will be freed when the command is freed.
2711	 */
2712	TAILQ_INSERT_TAIL(&cm->cm_prp_page_list, prp_page_info, prp_page_link);
2713
2714	/*
	 * Check if we are within 1 entry of a page boundary.  We don't want
	 * our first entry to be a PRP List entry.
2717	 */
2718	page_mask_result = (uintptr_t)((uint8_t *)curr_buff + prp_size) &
2719	    page_mask;
2720	if (!page_mask_result) {
2721		/* Bump up to next page boundary. */
2722		curr_buff = (uint32_t *)((uint8_t *)curr_buff + prp_size);
2723		msg_phys = (uint64_t *)((uint8_t *)msg_phys + prp_size);
2724	}
2725
2726	/* Fill in the chain element and make it an NVMe segment type. */
2727	main_chain_element->Address.High =
2728	    htole32((uint32_t)((uint64_t)(uintptr_t)msg_phys >> 32));
2729	main_chain_element->Address.Low =
2730	    htole32((uint32_t)(uintptr_t)msg_phys);
2731	main_chain_element->NextChainOffset = 0;
2732	main_chain_element->Flags = MPI2_IEEE_SGE_FLAGS_CHAIN_ELEMENT |
2733	    MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR |
2734	    MPI26_IEEE_SGE_FLAGS_NSF_NVME_PRP;
2735
2736	/* Set SGL pointer to start of contiguous PCIe buffer. */
2737	ptr_sgl = curr_buff;
2738	sge_dwords = 2;
2739	num_entries = 0;
2740
2741	/*
2742	 * NVMe has a very convoluted PRP format. One PRP is required for each
2743	 * page or partial page. We need to split up OS SG entries if they are
2744	 * longer than one page or cross a page boundary. We also have to insert
2745	 * a PRP list pointer entry as the last entry in each physical page of
2746	 * the PRP list.
2747	 *
2748	 * NOTE: The first PRP "entry" is actually placed in the first SGL entry
2749	 * in the main message in IEEE 64 format. The 2nd entry in the main
2750	 * message is the chain element, and the rest of the PRP entries are
2751	 * built in the contiguous PCIe buffer.
2752	 */
2753	first_prp_entry = 1;
2754	ptr_first_sgl = (uint32_t *)cm->cm_sge;
2755
2756	for (i = 0; i < segs_left; i++) {
2757		/* Get physical address and length of this SG entry. */
2758		paddr = segs[i].ds_addr;
2759		length = segs[i].ds_len;
2760
2761		/*
		 * Every SGE after the first must begin on a page boundary.
		 * If this one does not, the layout is unexpected, so fall
		 * back to a normal IEEE SGL.
2765		 */
2766		if (i) {
2767			if ((uint32_t)paddr & page_mask) {
2768				mpr_dprint(sc, MPR_ERROR, "Unaligned SGE while "
2769				    "building NVMe PRPs, low address is 0x%x\n",
2770				    (uint32_t)paddr);
2771				return 1;
2772			}
2773		}
2774
		/*
		 * Apart from the last SGE, if any other SGE does not end on a
		 * page boundary then there is a hole in the buffer.  A hole
		 * would lead to data corruption, so fall back to IEEE SGEs.
		 */
2779		if (i != (segs_left - 1)) {
2780			if (((uint32_t)paddr + length) & page_mask) {
2781				mpr_dprint(sc, MPR_ERROR, "Unaligned SGE "
2782				    "boundary while building NVMe PRPs, low "
2783				    "address: 0x%x and length: %u\n",
2784				    (uint32_t)paddr, length);
2785				return 1;
2786			}
2787		}
2788
2789		/* Loop while the length is not zero. */
2790		while (length) {
2791			/*
2792			 * Check if we need to put a list pointer here if we are
2793			 * at page boundary - prp_size.
2794			 */
2795			page_mask_result = (uintptr_t)((uint8_t *)ptr_sgl +
2796			    prp_size) & page_mask;
2797			if (!page_mask_result) {
2798				/*
2799				 * Need to put a PRP list pointer here.
2800				 */
2801				msg_phys = (uint64_t *)((uint8_t *)msg_phys +
2802				    prp_size);
2803				*ptr_sgl = htole32((uintptr_t)msg_phys);
2804				*(ptr_sgl+1) = htole32((uint64_t)(uintptr_t)
2805				    msg_phys >> 32);
2806				ptr_sgl += sge_dwords;
2807				num_entries++;
2808			}
2809
2810			/* Need to handle if entry will be part of a page. */
2811			offset = (uint32_t)paddr & page_mask;
2812			entry_len = PAGE_SIZE - offset;
2813			if (first_prp_entry) {
2814				/*
2815				 * Put IEEE entry in first SGE in main message.
2816				 * (Simple element, System addr, not end of
2817				 * list.)
2818				 */
2819				*ptr_first_sgl = htole32((uint32_t)paddr);
2820				*(ptr_first_sgl + 1) =
2821				    htole32((uint32_t)((uint64_t)paddr >> 32));
2822				*(ptr_first_sgl + 2) = htole32(entry_len);
2823				*(ptr_first_sgl + 3) = 0;
2824
2825				/* No longer the first PRP entry. */
2826				first_prp_entry = 0;
2827			} else {
2828				/* Put entry in list. */
2829				*ptr_sgl = htole32((uint32_t)paddr);
2830				*(ptr_sgl + 1) =
2831				    htole32((uint32_t)((uint64_t)paddr >> 32));
2832
2833				/* Bump ptr_sgl, msg_phys, and num_entries. */
2834				ptr_sgl += sge_dwords;
2835				msg_phys = (uint64_t *)((uint8_t *)msg_phys +
2836				    prp_size);
2837				num_entries++;
2838			}
2839
2840			/* Bump the phys address by the entry_len. */
2841			paddr += entry_len;
2842
2843			/* Decrement length accounting for last partial page. */
2844			if (entry_len > length)
2845				length = 0;
2846			else
2847				length -= entry_len;
2848		}
2849	}
2850
2851	/* Set chain element Length. */
2852	main_chain_element->Length = htole32(num_entries * prp_size);
2853
2854	/* Return 0, indicating we built a native SGL. */
2855	return 0;
2856}
2857
2858/*
2859 * Add a chain element as the next SGE for the specified command.
 * Reset cm_sge and cm_sglsize to indicate all the available space. Chains are
2861 * only required for IEEE commands.  Therefore there is no code for commands
2862 * that have the MPR_CM_FLAGS_SGE_SIMPLE flag set (and those commands
2863 * shouldn't be requesting chains).
2864 */
2865static int
2866mpr_add_chain(struct mpr_command *cm, int segsleft)
2867{
2868	struct mpr_softc *sc = cm->cm_sc;
2869	MPI2_REQUEST_HEADER *req;
2870	MPI25_IEEE_SGE_CHAIN64 *ieee_sgc;
2871	struct mpr_chain *chain;
2872	int sgc_size, current_segs, rem_segs, segs_per_frame;
2873	uint8_t next_chain_offset = 0;
2874
2875	/*
2876	 * Fail if a command is requesting a chain for SIMPLE SGE's.  For SAS3
2877	 * only IEEE commands should be requesting chains.  Return some error
2878	 * code other than 0.
2879	 */
2880	if (cm->cm_flags & MPR_CM_FLAGS_SGE_SIMPLE) {
2881		mpr_dprint(sc, MPR_ERROR, "A chain element cannot be added to "
2882		    "an MPI SGL.\n");
2883		return(ENOBUFS);
2884	}
2885
2886	sgc_size = sizeof(MPI25_IEEE_SGE_CHAIN64);
2887	if (cm->cm_sglsize < sgc_size)
2888		panic("MPR: Need SGE Error Code\n");
2889
2890	chain = mpr_alloc_chain(cm->cm_sc);
2891	if (chain == NULL)
2892		return (ENOBUFS);
2893
2894	/*
	 * Note: a doubly-linked list is used to make it easier to walk for
2896	 * debugging.
2897	 */
2898	TAILQ_INSERT_TAIL(&cm->cm_chain_list, chain, chain_link);
2899
2900	/*
2901	 * Need to know if the number of frames left is more than 1 or not.  If
2902	 * more than 1 frame is required, NextChainOffset will need to be set,
2903	 * which will just be the last segment of the frame.
2904	 */
2905	rem_segs = 0;
2906	if (cm->cm_sglsize < (sgc_size * segsleft)) {
2907		/*
		 * rem_segs is the number of segments remaining after the
2909		 * segments that will go into the current frame.  Since it is
2910		 * known that at least one more frame is required, account for
2911		 * the chain element.  To know if more than one more frame is
2912		 * required, just check if there will be a remainder after using
2913		 * the current frame (with this chain) and the next frame.  If
2914		 * so the NextChainOffset must be the last element of the next
2915		 * frame.
2916		 */
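		/*
		 * Illustrative example (hypothetical sizes): with a 16-byte
		 * IEEE chain SGE, 128 bytes left in the current frame, a
		 * 2048-byte chain frame and 200 segments remaining,
		 * current_segs = 128 / 16 - 1 = 7, rem_segs = 193 and
		 * segs_per_frame = 128; since 193 > 128, next_chain_offset is
		 * set to 127 so the last slot of the next frame can chain to
		 * yet another frame.
		 */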
2917		current_segs = (cm->cm_sglsize / sgc_size) - 1;
2918		rem_segs = segsleft - current_segs;
2919		segs_per_frame = sc->chain_frame_size / sgc_size;
2920		if (rem_segs > segs_per_frame) {
2921			next_chain_offset = segs_per_frame - 1;
2922		}
2923	}
2924	ieee_sgc = &((MPI25_SGE_IO_UNION *)cm->cm_sge)->IeeeChain;
2925	ieee_sgc->Length = next_chain_offset ?
2926	    htole32((uint32_t)sc->chain_frame_size) :
2927	    htole32((uint32_t)rem_segs * (uint32_t)sgc_size);
2928	ieee_sgc->NextChainOffset = next_chain_offset;
2929	ieee_sgc->Flags = (MPI2_IEEE_SGE_FLAGS_CHAIN_ELEMENT |
2930	    MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
2931	ieee_sgc->Address.Low = htole32(chain->chain_busaddr);
2932	ieee_sgc->Address.High = htole32(chain->chain_busaddr >> 32);
2933	cm->cm_sge = &((MPI25_SGE_IO_UNION *)chain->chain)->IeeeSimple;
2934	req = (MPI2_REQUEST_HEADER *)cm->cm_req;
2935	req->ChainOffset = (sc->chain_frame_size - sgc_size) >> 4;
2936
2937	cm->cm_sglsize = sc->chain_frame_size;
2938	return (0);
2939}
2940
2941/*
2942 * Add one scatter-gather element to the scatter-gather list for a command.
2943 * Maintain cm_sglsize and cm_sge as the remaining size and pointer to the
2944 * next SGE to fill in, respectively.  In Gen3, the MPI SGL does not have a
2945 * chain, so don't consider any chain additions.
2946 */
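/*
 * Within this file the caller is mpr_add_dmaseg(), which passes a single
 * MPI2_SGE_SIMPLE64 with len equal to sizeof(MPI2_SGE_SIMPLE64).
 */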
2947int
2948mpr_push_sge(struct mpr_command *cm, MPI2_SGE_SIMPLE64 *sge, size_t len,
2949    int segsleft)
2950{
2951	uint32_t saved_buf_len, saved_address_low, saved_address_high;
2952	u32 sge_flags;
2953
2954	/*
2955	 * case 1: >=1 more segment, no room for anything (error)
2956	 * case 2: 1 more segment and enough room for it
	 */
2958
2959	if (cm->cm_sglsize < (segsleft * sizeof(MPI2_SGE_SIMPLE64))) {
2960		mpr_dprint(cm->cm_sc, MPR_ERROR,
2961		    "%s: warning: Not enough room for MPI SGL in frame.\n",
2962		    __func__);
2963		return(ENOBUFS);
2964	}
2965
2966	KASSERT(segsleft == 1,
2967	    ("segsleft cannot be more than 1 for an MPI SGL; segsleft = %d\n",
2968	    segsleft));
2969
2970	/*
2971	 * There is one more segment left to add for the MPI SGL and there is
2972	 * enough room in the frame to add it.  This is the normal case because
2973	 * MPI SGL's don't have chains, otherwise something is wrong.
2974	 *
2975	 * If this is a bi-directional request, need to account for that
2976	 * here.  Save the pre-filled sge values.  These will be used
2977	 * either for the 2nd SGL or for a single direction SGL.  If
2978	 * cm_out_len is non-zero, this is a bi-directional request, so
2979	 * fill in the OUT SGL first, then the IN SGL, otherwise just
2980	 * fill in the IN SGL.  Note that at this time, when filling in
2981	 * 2 SGL's for a bi-directional request, they both use the same
2982	 * DMA buffer (same cm command).
2983	 */
2984	saved_buf_len = sge->FlagsLength & 0x00FFFFFF;
2985	saved_address_low = sge->Address.Low;
2986	saved_address_high = sge->Address.High;
2987	if (cm->cm_out_len) {
2988		sge->FlagsLength = cm->cm_out_len |
2989		    ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
2990		    MPI2_SGE_FLAGS_END_OF_BUFFER |
2991		    MPI2_SGE_FLAGS_HOST_TO_IOC |
2992		    MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
2993		    MPI2_SGE_FLAGS_SHIFT);
2994		cm->cm_sglsize -= len;
2995		/* Endian Safe code */
2996		sge_flags = sge->FlagsLength;
2997		sge->FlagsLength = htole32(sge_flags);
2998		sge->Address.High = htole32(sge->Address.High);
2999		sge->Address.Low = htole32(sge->Address.Low);
3000		bcopy(sge, cm->cm_sge, len);
3001		cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
3002	}
3003	sge->FlagsLength = saved_buf_len |
3004	    ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
3005	    MPI2_SGE_FLAGS_END_OF_BUFFER |
3006	    MPI2_SGE_FLAGS_LAST_ELEMENT |
3007	    MPI2_SGE_FLAGS_END_OF_LIST |
3008	    MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
3009	    MPI2_SGE_FLAGS_SHIFT);
3010	if (cm->cm_flags & MPR_CM_FLAGS_DATAIN) {
3011		sge->FlagsLength |=
3012		    ((uint32_t)(MPI2_SGE_FLAGS_IOC_TO_HOST) <<
3013		    MPI2_SGE_FLAGS_SHIFT);
3014	} else {
3015		sge->FlagsLength |=
3016		    ((uint32_t)(MPI2_SGE_FLAGS_HOST_TO_IOC) <<
3017		    MPI2_SGE_FLAGS_SHIFT);
3018	}
3019	sge->Address.Low = saved_address_low;
3020	sge->Address.High = saved_address_high;
3021
3022	cm->cm_sglsize -= len;
3023	/* Endian Safe code */
3024	sge_flags = sge->FlagsLength;
3025	sge->FlagsLength = htole32(sge_flags);
3026	sge->Address.High = htole32(sge->Address.High);
3027	sge->Address.Low = htole32(sge->Address.Low);
3028	bcopy(sge, cm->cm_sge, len);
3029	cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
3030	return (0);
3031}
3032
3033/*
3034 * Add one IEEE scatter-gather element (chain or simple) to the IEEE scatter-
3035 * gather list for a command.  Maintain cm_sglsize and cm_sge as the
3036 * remaining size and pointer to the next SGE to fill in, respectively.
3037 */
3038int
3039mpr_push_ieee_sge(struct mpr_command *cm, void *sgep, int segsleft)
3040{
3041	MPI2_IEEE_SGE_SIMPLE64 *sge = sgep;
3042	int error, ieee_sge_size = sizeof(MPI25_SGE_IO_UNION);
3043	uint32_t saved_buf_len, saved_address_low, saved_address_high;
3044	uint32_t sge_length;
3045
3046	/*
3047	 * case 1: No room for chain or segment (error).
3048	 * case 2: Two or more segments left but only room for chain.
3049	 * case 3: Last segment and room for it, so set flags.
3050	 */
3051
3052	/*
3053	 * There should be room for at least one element, or there is a big
3054	 * problem.
3055	 */
3056	if (cm->cm_sglsize < ieee_sge_size)
3057		panic("MPR: Need SGE Error Code\n");
3058
3059	if ((segsleft >= 2) && (cm->cm_sglsize < (ieee_sge_size * 2))) {
3060		if ((error = mpr_add_chain(cm, segsleft)) != 0)
3061			return (error);
3062	}
3063
3064	if (segsleft == 1) {
3065		/*
3066		 * If this is a bi-directional request, need to account for that
3067		 * here.  Save the pre-filled sge values.  These will be used
3068		 * either for the 2nd SGL or for a single direction SGL.  If
3069		 * cm_out_len is non-zero, this is a bi-directional request, so
3070		 * fill in the OUT SGL first, then the IN SGL, otherwise just
3071		 * fill in the IN SGL.  Note that at this time, when filling in
3072		 * 2 SGL's for a bi-directional request, they both use the same
3073		 * DMA buffer (same cm command).
3074		 */
3075		saved_buf_len = sge->Length;
3076		saved_address_low = sge->Address.Low;
3077		saved_address_high = sge->Address.High;
3078		if (cm->cm_out_len) {
3079			sge->Length = cm->cm_out_len;
3080			sge->Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
3081			    MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
3082			cm->cm_sglsize -= ieee_sge_size;
3083			/* Endian Safe code */
3084			sge_length = sge->Length;
3085			sge->Length = htole32(sge_length);
3086			sge->Address.High = htole32(sge->Address.High);
3087			sge->Address.Low = htole32(sge->Address.Low);
3088			bcopy(sgep, cm->cm_sge, ieee_sge_size);
3089			cm->cm_sge =
3090			    (MPI25_SGE_IO_UNION *)((uintptr_t)cm->cm_sge +
3091			    ieee_sge_size);
3092		}
3093		sge->Length = saved_buf_len;
3094		sge->Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
3095		    MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR |
3096		    MPI25_IEEE_SGE_FLAGS_END_OF_LIST);
3097		sge->Address.Low = saved_address_low;
3098		sge->Address.High = saved_address_high;
3099	}
3100
3101	cm->cm_sglsize -= ieee_sge_size;
3102	/* Endian Safe code */
3103	sge_length = sge->Length;
3104	sge->Length = htole32(sge_length);
3105	sge->Address.High = htole32(sge->Address.High);
3106	sge->Address.Low = htole32(sge->Address.Low);
3107	bcopy(sgep, cm->cm_sge, ieee_sge_size);
3108	cm->cm_sge = (MPI25_SGE_IO_UNION *)((uintptr_t)cm->cm_sge +
3109	    ieee_sge_size);
3110	return (0);
3111}
3112
3113/*
3114 * Add one dma segment to the scatter-gather list for a command.
3115 */
3116int
3117mpr_add_dmaseg(struct mpr_command *cm, vm_paddr_t pa, size_t len, u_int flags,
3118    int segsleft)
3119{
3120	MPI2_SGE_SIMPLE64 sge;
3121	MPI2_IEEE_SGE_SIMPLE64 ieee_sge;
3122
3123	if (!(cm->cm_flags & MPR_CM_FLAGS_SGE_SIMPLE)) {
3124		ieee_sge.Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
3125		    MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
3126		ieee_sge.Length = len;
3127		mpr_from_u64(pa, &ieee_sge.Address);
3128
3129		return (mpr_push_ieee_sge(cm, &ieee_sge, segsleft));
3130	} else {
3131		/*
3132		 * This driver always uses 64-bit address elements for
3133		 * simplicity.
3134		 */
3135		flags |= MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
3136		    MPI2_SGE_FLAGS_64_BIT_ADDRESSING;
3137		/* Set Endian safe macro in mpr_push_sge */
3138		sge.FlagsLength = len | (flags << MPI2_SGE_FLAGS_SHIFT);
3139		mpr_from_u64(pa, &sge.Address);
3140
3141		return (mpr_push_sge(cm, &sge, sizeof sge, segsleft));
3142	}
3143}
3144
3145static void
3146mpr_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
3147{
3148	struct mpr_softc *sc;
3149	struct mpr_command *cm;
3150	u_int i, dir, sflags;
3151
3152	cm = (struct mpr_command *)arg;
3153	sc = cm->cm_sc;
3154
3155	/*
3156	 * In this case, just print out a warning and let the chip tell the
3157	 * user they did the wrong thing.
3158	 */
3159	if ((cm->cm_max_segs != 0) && (nsegs > cm->cm_max_segs)) {
3160		mpr_dprint(sc, MPR_ERROR, "%s: warning: busdma returned %d "
3161		    "segments, more than the %d allowed\n", __func__, nsegs,
3162		    cm->cm_max_segs);
3163	}
3164
3165	/*
3166	 * Set up DMA direction flags.  Bi-directional requests are also handled
3167	 * here.  In that case, both direction flags will be set.
3168	 */
3169	sflags = 0;
3170	if (cm->cm_flags & MPR_CM_FLAGS_SMP_PASS) {
3171		/*
3172		 * We have to add a special case for SMP passthrough, there
3173		 * is no easy way to generically handle it.  The first
3174		 * S/G element is used for the command (therefore the
3175		 * direction bit needs to be set).  The second one is used
3176		 * for the reply.  We'll leave it to the caller to make
3177		 * sure we only have two buffers.
3178		 */
3179		/*
3180		 * Even though the busdma man page says it doesn't make
3181		 * sense to have both direction flags, it does in this case.
3182		 * We have one s/g element being accessed in each direction.
3183		 */
3184		dir = BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD;
3185
3186		/*
3187		 * Set the direction flag on the first buffer in the SMP
3188		 * passthrough request.  We'll clear it for the second one.
3189		 */
3190		sflags |= MPI2_SGE_FLAGS_DIRECTION |
3191			  MPI2_SGE_FLAGS_END_OF_BUFFER;
3192	} else if (cm->cm_flags & MPR_CM_FLAGS_DATAOUT) {
3193		sflags |= MPI2_SGE_FLAGS_HOST_TO_IOC;
3194		dir = BUS_DMASYNC_PREWRITE;
3195	} else
3196		dir = BUS_DMASYNC_PREREAD;
3197
3198	/* Check if a native SG list is needed for an NVMe PCIe device. */
3199	if (cm->cm_targ && cm->cm_targ->is_nvme &&
3200	    mpr_check_pcie_native_sgl(sc, cm, segs, nsegs) == 0) {
3201		/* A native SG list was built, skip to end. */
3202		goto out;
3203	}
3204
3205	for (i = 0; i < nsegs; i++) {
3206		if ((cm->cm_flags & MPR_CM_FLAGS_SMP_PASS) && (i != 0)) {
3207			sflags &= ~MPI2_SGE_FLAGS_DIRECTION;
3208		}
3209		error = mpr_add_dmaseg(cm, segs[i].ds_addr, segs[i].ds_len,
3210		    sflags, nsegs - i);
3211		if (error != 0) {
3212			/* Resource shortage, roll back! */
3213			if (ratecheck(&sc->lastfail, &mpr_chainfail_interval))
3214				mpr_dprint(sc, MPR_INFO, "Out of chain frames, "
3215				    "consider increasing hw.mpr.max_chains.\n");
3216			cm->cm_flags |= MPR_CM_FLAGS_CHAIN_FAILED;
3217			mpr_complete_command(sc, cm);
3218			return;
3219		}
3220	}
3221
3222out:
3223	bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir);
3224	mpr_enqueue_request(sc, cm);
3225
3226	return;
3227}
3228
3229static void
3230mpr_data_cb2(void *arg, bus_dma_segment_t *segs, int nsegs, bus_size_t mapsize,
3231	     int error)
3232{
3233	mpr_data_cb(arg, segs, nsegs, error);
3234}
3235
3236/*
 * This is the routine to enqueue commands asynchronously.
3238 * Note that the only error path here is from bus_dmamap_load(), which can
3239 * return EINPROGRESS if it is waiting for resources.  Other than this, it's
3240 * assumed that if you have a command in-hand, then you have enough credits
3241 * to use it.
3242 */
int
mpr_map_command(struct mpr_softc *sc, struct mpr_command *cm)
{
	int error = 0;

	if (cm->cm_flags & MPR_CM_FLAGS_USE_UIO) {
		error = bus_dmamap_load_uio(sc->buffer_dmat, cm->cm_dmamap,
		    &cm->cm_uio, mpr_data_cb2, cm, 0);
	} else if (cm->cm_flags & MPR_CM_FLAGS_USE_CCB) {
		error = bus_dmamap_load_ccb(sc->buffer_dmat, cm->cm_dmamap,
		    cm->cm_data, mpr_data_cb, cm, 0);
	} else if ((cm->cm_data != NULL) && (cm->cm_length != 0)) {
		error = bus_dmamap_load(sc->buffer_dmat, cm->cm_dmamap,
		    cm->cm_data, cm->cm_length, mpr_data_cb, cm, 0);
	} else {
		/* Add a zero-length element as needed */
		if (cm->cm_sge != NULL)
			mpr_add_dmaseg(cm, 0, 0, 0, 1);
		mpr_enqueue_request(sc, cm);
	}

	return (error);
}
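
#if 0
/*
 * Illustrative sketch only (compiled out): a typical asynchronous caller
 * allocates a command, points cm_data/cm_sge at its buffer and the SGE
 * within the request frame, sets a completion handler, and lets
 * mpr_map_command() do the DMA load and enqueue.  The function names here
 * are hypothetical; mpr_read_config_page() below is a real in-driver user
 * of this pattern.
 */
static void
example_done(struct mpr_softc *sc, struct mpr_command *cm)
{
	/* Called on completion; reply and data are valid here. */
	mpr_free_command(sc, cm);
}

static int
example_submit(struct mpr_softc *sc, MPI2_SGE_IO_UNION *sge, void *buf,
    uint32_t len)
{
	struct mpr_command *cm;

	if ((cm = mpr_alloc_command(sc)) == NULL)
		return (EBUSY);
	/* The actual firmware request would be filled into cm->cm_req here. */
	cm->cm_data = buf;
	cm->cm_length = len;
	cm->cm_sge = sge;	/* SGE location within cm->cm_req */
	cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION);
	cm->cm_flags = MPR_CM_FLAGS_SGE_SIMPLE | MPR_CM_FLAGS_DATAIN;
	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
	cm->cm_complete = example_done;
	return (mpr_map_command(sc, cm));
}
#endif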

/*
 * This is the routine to enqueue commands synchronously.  An error of
 * EINPROGRESS from mpr_map_command() is ignored since the command will
 * be executed and enqueued automatically.  Other errors come from msleep().
 */
int
mpr_wait_command(struct mpr_softc *sc, struct mpr_command **cmp, int timeout,
    int sleep_flag)
{
	int error, rc;
	struct timeval cur_time, start_time;
	struct mpr_command *cm = *cmp;

	if (sc->mpr_flags & MPR_FLAGS_DIAGRESET)
		return (EBUSY);

	cm->cm_complete = NULL;
	cm->cm_flags |= (MPR_CM_FLAGS_WAKEUP | MPR_CM_FLAGS_POLLED);
	error = mpr_map_command(sc, cm);
	if ((error != 0) && (error != EINPROGRESS))
		return (error);

	/*
	 * Check for context and wait for 50ms at a time until time has
	 * expired or the command has finished.  If msleep can't be used,
	 * need to poll.
	 */
#if __FreeBSD_version >= 1000029
	if (curthread->td_no_sleeping)
#else //__FreeBSD_version < 1000029
	if (curthread->td_pflags & TDP_NOSLEEPING)
#endif //__FreeBSD_version >= 1000029
		sleep_flag = NO_SLEEP;
	getmicrouptime(&start_time);
	if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP) {
		error = msleep(cm, &sc->mpr_mtx, 0, "mprwait", timeout*hz);
		if (error == EWOULDBLOCK) {
			/*
			 * Record the actual elapsed time in the case of a
			 * timeout for the message below.
			 */
			getmicrouptime(&cur_time);
			timevalsub(&cur_time, &start_time);
		}
	} else {
		while ((cm->cm_flags & MPR_CM_FLAGS_COMPLETE) == 0) {
			mpr_intr_locked(sc);
			if (sleep_flag == CAN_SLEEP)
				pause("mprwait", hz/20);
			else
				DELAY(50000);

			getmicrouptime(&cur_time);
			timevalsub(&cur_time, &start_time);
			if (cur_time.tv_sec > timeout) {
				error = EWOULDBLOCK;
				break;
			}
		}
	}

	if (error == EWOULDBLOCK) {
		mpr_dprint(sc, MPR_FAULT, "Calling Reinit from %s, timeout=%d,"
		    " elapsed=%jd\n", __func__, timeout,
		    (intmax_t)cur_time.tv_sec);
		rc = mpr_reinit(sc);
		mpr_dprint(sc, MPR_FAULT, "Reinit %s\n", (rc == 0) ? "success" :
		    "failed");
		if (sc->mpr_flags & MPR_FLAGS_REALLOCATED) {
			/*
			 * Tell the caller that we freed the command in a
			 * reinit.
			 */
			*cmp = NULL;
		}
		error = ETIMEDOUT;
	}
	return (error);
}
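
#if 0
/*
 * Illustrative sketch only (compiled out): synchronous callers pass the
 * command by reference so that mpr_wait_command() can NULL it out when a
 * timeout forces a controller reinit that frees all commands.  Request
 * setup is elided; the point is the post-call ownership check, which
 * mirrors the error handling in mpr_read_config_page() below.
 */
static int
example_sync_request(struct mpr_softc *sc)
{
	struct mpr_command *cm;
	int error;

	if ((cm = mpr_alloc_command(sc)) == NULL)
		return (EBUSY);
	/* ... fill in cm_req, cm_data, cm_sge, and cm_desc here ... */
	error = mpr_wait_command(sc, &cm, 60, CAN_SLEEP);
	if (error != 0) {
		/* On a timeout-induced reinit, cm may already be freed. */
		if (cm != NULL)
			mpr_free_command(sc, cm);
		return (error);
	}
	/* ... examine cm->cm_reply ... */
	mpr_free_command(sc, cm);
	return (0);
}
#endif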

/*
 * This is the routine to enqueue a command synchronously and poll for
 * completion.  Its use should be rare.
 */
int
mpr_request_polled(struct mpr_softc *sc, struct mpr_command **cmp)
{
	int error, rc;
	struct timeval cur_time, start_time;
	struct mpr_command *cm = *cmp;

	error = 0;

	cm->cm_flags |= MPR_CM_FLAGS_POLLED;
	cm->cm_complete = NULL;
	mpr_map_command(sc, cm);

	getmicrouptime(&start_time);
	while ((cm->cm_flags & MPR_CM_FLAGS_COMPLETE) == 0) {
		mpr_intr_locked(sc);

		if (mtx_owned(&sc->mpr_mtx))
			msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
			    "mprpoll", hz/20);
		else
			pause("mprpoll", hz/20);

		/*
		 * Check for real-time timeout and fail if more than 60
		 * seconds have elapsed.
		 */
		getmicrouptime(&cur_time);
		timevalsub(&cur_time, &start_time);
		if (cur_time.tv_sec > 60) {
			mpr_dprint(sc, MPR_FAULT, "polling failed\n");
			error = ETIMEDOUT;
			break;
		}
	}

	if (error) {
		mpr_dprint(sc, MPR_FAULT, "Calling Reinit from %s\n", __func__);
		rc = mpr_reinit(sc);
		mpr_dprint(sc, MPR_FAULT, "Reinit %s\n", (rc == 0) ? "success" :
		    "failed");

		if (sc->mpr_flags & MPR_FLAGS_REALLOCATED) {
			/*
			 * Tell the caller that we freed the command in a
			 * reinit.
			 */
			*cmp = NULL;
		}
	}
	return (error);
}
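
#if 0
/*
 * Illustrative sketch only (compiled out): mpr_request_polled() follows the
 * same ownership rules as mpr_wait_command(), but spins on the completion
 * flag rather than relying on a wakeup, so it suits callers that cannot
 * depend on interrupt delivery.  The reply inspection below is a
 * hypothetical example of the usual IOCStatus check; the caller is assumed
 * to have filled in the command beforehand.
 */
static int
example_polled_request(struct mpr_softc *sc, struct mpr_command *cm)
{
	MPI2_DEFAULT_REPLY *reply;
	int error;

	error = mpr_request_polled(sc, &cm);
	if (error != 0) {
		if (cm != NULL)
			mpr_free_command(sc, cm);
		return (error);
	}
	reply = (MPI2_DEFAULT_REPLY *)cm->cm_reply;
	if (reply == NULL ||
	    (le16toh(reply->IOCStatus) & MPI2_IOCSTATUS_MASK) !=
	    MPI2_IOCSTATUS_SUCCESS)
		error = EIO;
	mpr_free_command(sc, cm);
	return (error);
}
#endif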

/*
 * The MPT driver had a verbose interface for config pages.  This driver
 * reduces it to much simpler terms, similar to the Linux driver.
 */
int
mpr_read_config_page(struct mpr_softc *sc, struct mpr_config_params *params)
{
	MPI2_CONFIG_REQUEST *req;
	struct mpr_command *cm;
	int error;

	if (sc->mpr_flags & MPR_FLAGS_BUSY) {
		return (EBUSY);
	}

	cm = mpr_alloc_command(sc);
	if (cm == NULL) {
		return (EBUSY);
	}

	req = (MPI2_CONFIG_REQUEST *)cm->cm_req;
	req->Function = MPI2_FUNCTION_CONFIG;
	req->Action = params->action;
	req->SGLFlags = 0;
	req->ChainOffset = 0;
	req->PageAddress = params->page_address;
	if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) {
		MPI2_CONFIG_EXTENDED_PAGE_HEADER *hdr;

		hdr = &params->hdr.Ext;
		req->ExtPageType = hdr->ExtPageType;
		req->ExtPageLength = hdr->ExtPageLength;
		req->Header.PageType = MPI2_CONFIG_PAGETYPE_EXTENDED;
		req->Header.PageLength = 0; /* Must be set to zero */
		req->Header.PageNumber = hdr->PageNumber;
		req->Header.PageVersion = hdr->PageVersion;
	} else {
		MPI2_CONFIG_PAGE_HEADER *hdr;

		hdr = &params->hdr.Struct;
		req->Header.PageType = hdr->PageType;
		req->Header.PageNumber = hdr->PageNumber;
		req->Header.PageLength = hdr->PageLength;
		req->Header.PageVersion = hdr->PageVersion;
	}

	cm->cm_data = params->buffer;
	cm->cm_length = params->length;
	if (cm->cm_data != NULL) {
		cm->cm_sge = &req->PageBufferSGE;
		cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION);
		cm->cm_flags = MPR_CM_FLAGS_SGE_SIMPLE | MPR_CM_FLAGS_DATAIN;
	} else
		cm->cm_sge = NULL;
	cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;

	cm->cm_complete_data = params;
	if (params->callback != NULL) {
		cm->cm_complete = mpr_config_complete;
		return (mpr_map_command(sc, cm));
	} else {
		error = mpr_wait_command(sc, &cm, 0, CAN_SLEEP);
		if (error) {
			mpr_dprint(sc, MPR_FAULT,
			    "Error %d reading config page\n", error);
			if (cm != NULL)
				mpr_free_command(sc, cm);
			return (error);
		}
		mpr_config_complete(sc, cm);
	}

	return (0);
}
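
#if 0
/*
 * Illustrative sketch only (compiled out): the usual two-step config page
 * read.  First fetch the page header (no buffer) so the IOC reports the
 * real length/version, then issue the actual read into a caller-supplied
 * buffer.  A synchronous read is selected by leaving params.callback NULL.
 * The page type/number chosen here is arbitrary for the example.
 */
static int
example_read_config_page(struct mpr_softc *sc, void *buf, u_int len)
{
	struct mpr_config_params params;
	int error;

	bzero(&params, sizeof(params));
	params.hdr.Struct.PageType = MPI2_CONFIG_PAGETYPE_IOC;
	params.hdr.Struct.PageNumber = 8;
	params.action = MPI2_CONFIG_ACTION_PAGE_HEADER;
	params.page_address = 0;
	params.buffer = NULL;
	params.length = 0;
	params.callback = NULL;

	if ((error = mpr_read_config_page(sc, &params)) != 0)
		return (error);
	if ((params.status & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
		return (EIO);

	/* Second pass: read the current page contents using the header. */
	params.action = MPI2_CONFIG_ACTION_PAGE_READ_CURRENT;
	params.buffer = buf;
	params.length = len;
	if ((error = mpr_read_config_page(sc, &params)) != 0)
		return (error);
	if ((params.status & MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
		return (EIO);
	return (0);
}
#endif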

int
mpr_write_config_page(struct mpr_softc *sc, struct mpr_config_params *params)
{
	/* Config page writes are not currently implemented. */
	return (EINVAL);
}

static void
mpr_config_complete(struct mpr_softc *sc, struct mpr_command *cm)
{
	MPI2_CONFIG_REPLY *reply;
	struct mpr_config_params *params;

	MPR_FUNCTRACE(sc);
	params = cm->cm_complete_data;

	if (cm->cm_data != NULL) {
		bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap,
		    BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap);
	}

	/*
	 * XXX KDM need to do more error recovery?  This results in the
	 * device in question not getting probed.
	 */
	if ((cm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) {
		params->status = MPI2_IOCSTATUS_BUSY;
		goto done;
	}

	reply = (MPI2_CONFIG_REPLY *)cm->cm_reply;
	if (reply == NULL) {
		params->status = MPI2_IOCSTATUS_BUSY;
		goto done;
	}
	params->status = reply->IOCStatus;
	if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) {
		params->hdr.Ext.ExtPageType = reply->ExtPageType;
		params->hdr.Ext.ExtPageLength = reply->ExtPageLength;
		params->hdr.Ext.PageType = reply->Header.PageType;
		params->hdr.Ext.PageNumber = reply->Header.PageNumber;
		params->hdr.Ext.PageVersion = reply->Header.PageVersion;
	} else {
		params->hdr.Struct.PageType = reply->Header.PageType;
		params->hdr.Struct.PageNumber = reply->Header.PageNumber;
		params->hdr.Struct.PageLength = reply->Header.PageLength;
		params->hdr.Struct.PageVersion = reply->Header.PageVersion;
	}

done:
	mpr_free_command(sc, cm);
	if (params->callback != NULL)
		params->callback(sc, params);

	return;
}
