1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2/*
3 * Copyright(c) 2015 - 2018 Intel Corporation.
4 */
5
6#include <linux/spinlock.h>
7#include <linux/seqlock.h>
8#include <linux/netdevice.h>
9#include <linux/moduleparam.h>
10#include <linux/bitops.h>
11#include <linux/timer.h>
12#include <linux/vmalloc.h>
13#include <linux/highmem.h>
14
15#include "hfi.h"
16#include "common.h"
17#include "qp.h"
18#include "sdma.h"
19#include "iowait.h"
20#include "trace.h"
21
/* must be a power of 2, at least 64 and at most 32768 */
23#define SDMA_DESCQ_CNT 2048
24#define SDMA_DESC_INTR 64
25#define INVALID_TAIL 0xffff
26#define SDMA_PAD max_t(size_t, MAX_16B_PADDING, sizeof(u32))
27
28static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
29module_param(sdma_descq_cnt, uint, S_IRUGO);
30MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
31
32static uint sdma_idle_cnt = 250;
33module_param(sdma_idle_cnt, uint, S_IRUGO);
34MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)");
35
36uint mod_num_sdma;
37module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
38MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use");
39
40static uint sdma_desct_intr = SDMA_DESC_INTR;
41module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptors before interrupt");
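
/*
 * Example (illustrative, not taken from this file): these are ordinary
 * module parameters, so a load line such as
 *   modprobe hfi1 sdma_descq_cnt=4096 num_sdma=8 desct_intr=128
 * sizes the descriptor ring, caps the number of engines and changes the
 * interrupt cadence.  sdma_descq_cnt is validated in sdma_get_descq_cnt()
 * and falls back to SDMA_DESCQ_CNT unless it is a power of 2 in
 * [64, 32768].
 */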
43
44#define SDMA_WAIT_BATCH_SIZE 20
/* max wait time for an SDMA engine to indicate it has halted */
46#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
47/* all SDMA engine errors that cause a halt */
48
49#define SD(name) SEND_DMA_##name
50#define ALL_SDMA_ENG_HALT_ERRS \
51	(SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
52	| SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
53	| SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
54	| SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
55	| SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
56	| SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
57	| SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
58	| SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
59	| SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
60	| SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
61	| SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
62	| SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
63	| SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
64	| SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
65	| SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
66	| SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
67	| SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
68	| SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
69
70/* sdma_sendctrl operations */
71#define SDMA_SENDCTRL_OP_ENABLE    BIT(0)
72#define SDMA_SENDCTRL_OP_INTENABLE BIT(1)
73#define SDMA_SENDCTRL_OP_HALT      BIT(2)
74#define SDMA_SENDCTRL_OP_CLEANUP   BIT(3)
75
76/* handle long defines */
77#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
78SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
79#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
80SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
81
82static const char * const sdma_state_names[] = {
83	[sdma_state_s00_hw_down]                = "s00_HwDown",
84	[sdma_state_s10_hw_start_up_halt_wait]  = "s10_HwStartUpHaltWait",
85	[sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
86	[sdma_state_s20_idle]                   = "s20_Idle",
87	[sdma_state_s30_sw_clean_up_wait]       = "s30_SwCleanUpWait",
88	[sdma_state_s40_hw_clean_up_wait]       = "s40_HwCleanUpWait",
89	[sdma_state_s50_hw_halt_wait]           = "s50_HwHaltWait",
90	[sdma_state_s60_idle_halt_wait]         = "s60_IdleHaltWait",
91	[sdma_state_s80_hw_freeze]		= "s80_HwFreeze",
92	[sdma_state_s82_freeze_sw_clean]	= "s82_FreezeSwClean",
93	[sdma_state_s99_running]                = "s99_Running",
94};
95
96#ifdef CONFIG_SDMA_VERBOSITY
97static const char * const sdma_event_names[] = {
98	[sdma_event_e00_go_hw_down]   = "e00_GoHwDown",
99	[sdma_event_e10_go_hw_start]  = "e10_GoHwStart",
100	[sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
101	[sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
102	[sdma_event_e30_go_running]   = "e30_GoRunning",
103	[sdma_event_e40_sw_cleaned]   = "e40_SwCleaned",
104	[sdma_event_e50_hw_cleaned]   = "e50_HwCleaned",
105	[sdma_event_e60_hw_halted]    = "e60_HwHalted",
106	[sdma_event_e70_go_idle]      = "e70_GoIdle",
107	[sdma_event_e80_hw_freeze]    = "e80_HwFreeze",
108	[sdma_event_e81_hw_frozen]    = "e81_HwFrozen",
109	[sdma_event_e82_hw_unfreeze]  = "e82_HwUnfreeze",
110	[sdma_event_e85_link_down]    = "e85_LinkDown",
111	[sdma_event_e90_sw_halted]    = "e90_SwHalted",
112};
113#endif
114
115static const struct sdma_set_state_action sdma_action_table[] = {
116	[sdma_state_s00_hw_down] = {
117		.go_s99_running_tofalse = 1,
118		.op_enable = 0,
119		.op_intenable = 0,
120		.op_halt = 0,
121		.op_cleanup = 0,
122	},
123	[sdma_state_s10_hw_start_up_halt_wait] = {
124		.op_enable = 0,
125		.op_intenable = 0,
126		.op_halt = 1,
127		.op_cleanup = 0,
128	},
129	[sdma_state_s15_hw_start_up_clean_wait] = {
130		.op_enable = 0,
131		.op_intenable = 1,
132		.op_halt = 0,
133		.op_cleanup = 1,
134	},
135	[sdma_state_s20_idle] = {
136		.op_enable = 0,
137		.op_intenable = 1,
138		.op_halt = 0,
139		.op_cleanup = 0,
140	},
141	[sdma_state_s30_sw_clean_up_wait] = {
142		.op_enable = 0,
143		.op_intenable = 0,
144		.op_halt = 0,
145		.op_cleanup = 0,
146	},
147	[sdma_state_s40_hw_clean_up_wait] = {
148		.op_enable = 0,
149		.op_intenable = 0,
150		.op_halt = 0,
151		.op_cleanup = 1,
152	},
153	[sdma_state_s50_hw_halt_wait] = {
154		.op_enable = 0,
155		.op_intenable = 0,
156		.op_halt = 0,
157		.op_cleanup = 0,
158	},
159	[sdma_state_s60_idle_halt_wait] = {
160		.go_s99_running_tofalse = 1,
161		.op_enable = 0,
162		.op_intenable = 0,
163		.op_halt = 1,
164		.op_cleanup = 0,
165	},
166	[sdma_state_s80_hw_freeze] = {
167		.op_enable = 0,
168		.op_intenable = 0,
169		.op_halt = 0,
170		.op_cleanup = 0,
171	},
172	[sdma_state_s82_freeze_sw_clean] = {
173		.op_enable = 0,
174		.op_intenable = 0,
175		.op_halt = 0,
176		.op_cleanup = 0,
177	},
178	[sdma_state_s99_running] = {
179		.op_enable = 1,
180		.op_intenable = 1,
181		.op_halt = 0,
182		.op_cleanup = 0,
183		.go_s99_running_totrue = 1,
184	},
185};
186
187#define SDMA_TAIL_UPDATE_THRESH 0x1F
188
189/* declare all statics here rather than keep sorting */
190static void sdma_complete(struct kref *);
191static void sdma_finalput(struct sdma_state *);
192static void sdma_get(struct sdma_state *);
193static void sdma_hw_clean_up_task(struct tasklet_struct *);
194static void sdma_put(struct sdma_state *);
195static void sdma_set_state(struct sdma_engine *, enum sdma_states);
196static void sdma_start_hw_clean_up(struct sdma_engine *);
197static void sdma_sw_clean_up_task(struct tasklet_struct *);
198static void sdma_sendctrl(struct sdma_engine *, unsigned);
199static void init_sdma_regs(struct sdma_engine *, u32, uint);
200static void sdma_process_event(
201	struct sdma_engine *sde,
202	enum sdma_events event);
203static void __sdma_process_event(
204	struct sdma_engine *sde,
205	enum sdma_events event);
206static void dump_sdma_state(struct sdma_engine *sde);
207static void sdma_make_progress(struct sdma_engine *sde, u64 status);
208static void sdma_desc_avail(struct sdma_engine *sde, uint avail);
209static void sdma_flush_descq(struct sdma_engine *sde);
210
211/**
212 * sdma_state_name() - return state string from enum
213 * @state: state
214 */
215static const char *sdma_state_name(enum sdma_states state)
216{
217	return sdma_state_names[state];
218}
219
220static void sdma_get(struct sdma_state *ss)
221{
222	kref_get(&ss->kref);
223}
224
225static void sdma_complete(struct kref *kref)
226{
227	struct sdma_state *ss =
228		container_of(kref, struct sdma_state, kref);
229
230	complete(&ss->comp);
231}
232
233static void sdma_put(struct sdma_state *ss)
234{
235	kref_put(&ss->kref, sdma_complete);
236}
237
238static void sdma_finalput(struct sdma_state *ss)
239{
240	sdma_put(ss);
241	wait_for_completion(&ss->comp);
242}
243
244static inline void write_sde_csr(
245	struct sdma_engine *sde,
246	u32 offset0,
247	u64 value)
248{
249	write_kctxt_csr(sde->dd, sde->this_idx, offset0, value);
250}
251
252static inline u64 read_sde_csr(
253	struct sdma_engine *sde,
254	u32 offset0)
255{
256	return read_kctxt_csr(sde->dd, sde->this_idx, offset0);
257}
258
259/*
260 * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
261 * sdma engine 'sde' to drop to 0.
262 */
263static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
264					int pause)
265{
266	u64 off = 8 * sde->this_idx;
267	struct hfi1_devdata *dd = sde->dd;
268	int lcnt = 0;
269	u64 reg_prev;
270	u64 reg = 0;
271
272	while (1) {
273		reg_prev = reg;
274		reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
275
276		reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
277		reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
278		if (reg == 0)
279			break;
		/* counter is reset if occupancy count changes */
281		if (reg != reg_prev)
282			lcnt = 0;
283		if (lcnt++ > 500) {
284			/* timed out - bounce the link */
285			dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
286				   __func__, sde->this_idx, (u32)reg);
287			queue_work(dd->pport->link_wq,
288				   &dd->pport->link_bounce_work);
289			break;
290		}
291		udelay(1);
292	}
293}
294
295/*
296 * sdma_wait() - wait for packet egress to complete for all SDMA engines,
297 * and pause for credit return.
298 */
299void sdma_wait(struct hfi1_devdata *dd)
300{
301	int i;
302
303	for (i = 0; i < dd->num_sdma; i++) {
304		struct sdma_engine *sde = &dd->per_sdma[i];
305
306		sdma_wait_for_packet_egress(sde, 0);
307	}
308}
309
310static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
311{
312	u64 reg;
313
314	if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT))
315		return;
316	reg = cnt;
317	reg &= SD(DESC_CNT_CNT_MASK);
318	reg <<= SD(DESC_CNT_CNT_SHIFT);
319	write_sde_csr(sde, SD(DESC_CNT), reg);
320}
321
322static inline void complete_tx(struct sdma_engine *sde,
323			       struct sdma_txreq *tx,
324			       int res)
325{
	/* cache wait and complete; the callback may modify or free the tx */
327	struct iowait *wait = tx->wait;
328	callback_t complete = tx->complete;
329
330#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
331	trace_hfi1_sdma_out_sn(sde, tx->sn);
332	if (WARN_ON_ONCE(sde->head_sn != tx->sn))
333		dd_dev_err(sde->dd, "expected %llu got %llu\n",
334			   sde->head_sn, tx->sn);
335	sde->head_sn++;
336#endif
337	__sdma_txclean(sde->dd, tx);
338	if (complete)
339		(*complete)(tx, res);
340	if (iowait_sdma_dec(wait))
341		iowait_drain_wakeup(wait);
342}
343
344/*
 * Complete all the sdma requests with an SDMA_TXREQ_S_ABORTED status
346 *
347 * Depending on timing there can be txreqs in two places:
348 * - in the descq ring
349 * - in the flush list
350 *
351 * To avoid ordering issues the descq ring needs to be flushed
352 * first followed by the flush list.
353 *
354 * This routine is called from two places
355 * - From a work queue item
356 * - Directly from the state machine just before setting the
357 *   state to running
358 *
359 * Must be called with head_lock held
360 *
361 */
362static void sdma_flush(struct sdma_engine *sde)
363{
364	struct sdma_txreq *txp, *txp_next;
365	LIST_HEAD(flushlist);
366	unsigned long flags;
367	uint seq;
368
369	/* flush from head to tail */
370	sdma_flush_descq(sde);
371	spin_lock_irqsave(&sde->flushlist_lock, flags);
372	/* copy flush list */
373	list_splice_init(&sde->flushlist, &flushlist);
374	spin_unlock_irqrestore(&sde->flushlist_lock, flags);
375	/* flush from flush list */
376	list_for_each_entry_safe(txp, txp_next, &flushlist, list)
377		complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
378	/* wakeup QPs orphaned on the dmawait list */
379	do {
380		struct iowait *w, *nw;
381
382		seq = read_seqbegin(&sde->waitlock);
383		if (!list_empty(&sde->dmawait)) {
384			write_seqlock(&sde->waitlock);
385			list_for_each_entry_safe(w, nw, &sde->dmawait, list) {
386				if (w->wakeup) {
387					w->wakeup(w, SDMA_AVAIL_REASON);
388					list_del_init(&w->list);
389				}
390			}
391			write_sequnlock(&sde->waitlock);
392		}
393	} while (read_seqretry(&sde->waitlock, seq));
394}
395
396/*
397 * Fields a work request for flushing the descq ring
398 * and the flush list
399 *
400 * If the engine has been brought to running during
401 * the scheduling delay, the flush is ignored, assuming
402 * that the process of bringing the engine to running
403 * would have done this flush prior to going to running.
404 *
405 */
406static void sdma_field_flush(struct work_struct *work)
407{
408	unsigned long flags;
409	struct sdma_engine *sde =
410		container_of(work, struct sdma_engine, flush_worker);
411
412	write_seqlock_irqsave(&sde->head_lock, flags);
413	if (!__sdma_running(sde))
414		sdma_flush(sde);
415	write_sequnlock_irqrestore(&sde->head_lock, flags);
416}
417
418static void sdma_err_halt_wait(struct work_struct *work)
419{
420	struct sdma_engine *sde = container_of(work, struct sdma_engine,
421						err_halt_worker);
422	u64 statuscsr;
423	unsigned long timeout;
424
425	timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);
426	while (1) {
427		statuscsr = read_sde_csr(sde, SD(STATUS));
428		statuscsr &= SD(STATUS_ENG_HALTED_SMASK);
429		if (statuscsr)
430			break;
431		if (time_after(jiffies, timeout)) {
432			dd_dev_err(sde->dd,
433				   "SDMA engine %d - timeout waiting for engine to halt\n",
434				   sde->this_idx);
435			/*
436			 * Continue anyway.  This could happen if there was
437			 * an uncorrectable error in the wrong spot.
438			 */
439			break;
440		}
441		usleep_range(80, 120);
442	}
443
444	sdma_process_event(sde, sdma_event_e15_hw_halt_done);
445}
446
447static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
448{
449	if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) {
450		unsigned index;
451		struct hfi1_devdata *dd = sde->dd;
452
453		for (index = 0; index < dd->num_sdma; index++) {
454			struct sdma_engine *curr_sdma = &dd->per_sdma[index];
455
456			if (curr_sdma != sde)
457				curr_sdma->progress_check_head =
458							curr_sdma->descq_head;
459		}
460		dd_dev_err(sde->dd,
461			   "SDMA engine %d - check scheduled\n",
462				sde->this_idx);
463		mod_timer(&sde->err_progress_check_timer, jiffies + 10);
464	}
465}
466
467static void sdma_err_progress_check(struct timer_list *t)
468{
469	unsigned index;
470	struct sdma_engine *sde = from_timer(sde, t, err_progress_check_timer);
471
472	dd_dev_err(sde->dd, "SDE progress check event\n");
473	for (index = 0; index < sde->dd->num_sdma; index++) {
474		struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
475		unsigned long flags;
476
477		/* check progress on each engine except the current one */
478		if (curr_sde == sde)
479			continue;
480		/*
		 * We must disable interrupts when acquiring curr_sde->tail_lock,
		 * to avoid a deadlock if an interrupt triggers and spins on
		 * the same lock on the same CPU.
484		 */
485		spin_lock_irqsave(&curr_sde->tail_lock, flags);
486		write_seqlock(&curr_sde->head_lock);
487
488		/* skip non-running queues */
489		if (curr_sde->state.current_state != sdma_state_s99_running) {
490			write_sequnlock(&curr_sde->head_lock);
491			spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
492			continue;
493		}
494
495		if ((curr_sde->descq_head != curr_sde->descq_tail) &&
496		    (curr_sde->descq_head ==
497				curr_sde->progress_check_head))
498			__sdma_process_event(curr_sde,
499					     sdma_event_e90_sw_halted);
500		write_sequnlock(&curr_sde->head_lock);
501		spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
502	}
503	schedule_work(&sde->err_halt_worker);
504}
505
506static void sdma_hw_clean_up_task(struct tasklet_struct *t)
507{
508	struct sdma_engine *sde = from_tasklet(sde, t,
509					       sdma_hw_clean_up_task);
510	u64 statuscsr;
511
512	while (1) {
513#ifdef CONFIG_SDMA_VERBOSITY
514		dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
515			   sde->this_idx, slashstrip(__FILE__), __LINE__,
516			__func__);
517#endif
518		statuscsr = read_sde_csr(sde, SD(STATUS));
519		statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK);
520		if (statuscsr)
521			break;
522		udelay(10);
523	}
524
525	sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
526}
527
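/* return the txreq, if any, at the current head of the tx ring */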
528static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
529{
530	return sde->tx_ring[sde->tx_head & sde->sdma_mask];
531}
532
533/*
534 * flush ring for recovery
535 */
536static void sdma_flush_descq(struct sdma_engine *sde)
537{
538	u16 head, tail;
539	int progress = 0;
540	struct sdma_txreq *txp = get_txhead(sde);
541
542	/* The reason for some of the complexity of this code is that
543	 * not all descriptors have corresponding txps.  So, we have to
544	 * be able to skip over descs until we wander into the range of
545	 * the next txp on the list.
546	 */
547	head = sde->descq_head & sde->sdma_mask;
548	tail = sde->descq_tail & sde->sdma_mask;
549	while (head != tail) {
550		/* advance head, wrap if needed */
551		head = ++sde->descq_head & sde->sdma_mask;
552		/* if now past this txp's descs, do the callback */
553		if (txp && txp->next_descq_idx == head) {
554			/* remove from list */
555			sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
556			complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
557			trace_hfi1_sdma_progress(sde, head, tail, txp);
558			txp = get_txhead(sde);
559		}
560		progress++;
561	}
562	if (progress)
563		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
564}
565
566static void sdma_sw_clean_up_task(struct tasklet_struct *t)
567{
568	struct sdma_engine *sde = from_tasklet(sde, t, sdma_sw_clean_up_task);
569	unsigned long flags;
570
571	spin_lock_irqsave(&sde->tail_lock, flags);
572	write_seqlock(&sde->head_lock);
573
574	/*
575	 * At this point, the following should always be true:
576	 * - We are halted, so no more descriptors are getting retired.
577	 * - We are not running, so no one is submitting new work.
578	 * - Only we can send the e40_sw_cleaned, so we can't start
579	 *   running again until we say so.  So, the active list and
580	 *   descq are ours to play with.
581	 */
582
583	/*
584	 * In the error clean up sequence, software clean must be called
585	 * before the hardware clean so we can use the hardware head in
586	 * the progress routine.  A hardware clean or SPC unfreeze will
587	 * reset the hardware head.
588	 *
589	 * Process all retired requests. The progress routine will use the
590	 * latest physical hardware head - we are not running so speed does
591	 * not matter.
592	 */
593	sdma_make_progress(sde, 0);
594
595	sdma_flush(sde);
596
597	/*
598	 * Reset our notion of head and tail.
599	 * Note that the HW registers have been reset via an earlier
600	 * clean up.
601	 */
602	sde->descq_tail = 0;
603	sde->descq_head = 0;
604	sde->desc_avail = sdma_descq_freecnt(sde);
605	*sde->head_dma = 0;
606
607	__sdma_process_event(sde, sdma_event_e40_sw_cleaned);
608
609	write_sequnlock(&sde->head_lock);
610	spin_unlock_irqrestore(&sde->tail_lock, flags);
611}
612
613static void sdma_sw_tear_down(struct sdma_engine *sde)
614{
615	struct sdma_state *ss = &sde->state;
616
617	/* Releasing this reference means the state machine has stopped. */
618	sdma_put(ss);
619
620	/* stop waiting for all unfreeze events to complete */
621	atomic_set(&sde->dd->sdma_unfreeze_count, -1);
622	wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
623}
624
625static void sdma_start_hw_clean_up(struct sdma_engine *sde)
626{
627	tasklet_hi_schedule(&sde->sdma_hw_clean_up_task);
628}
629
630static void sdma_set_state(struct sdma_engine *sde,
631			   enum sdma_states next_state)
632{
633	struct sdma_state *ss = &sde->state;
634	const struct sdma_set_state_action *action = sdma_action_table;
635	unsigned op = 0;
636
637	trace_hfi1_sdma_state(
638		sde,
639		sdma_state_names[ss->current_state],
640		sdma_state_names[next_state]);
641
642	/* debugging bookkeeping */
643	ss->previous_state = ss->current_state;
644	ss->previous_op = ss->current_op;
645	ss->current_state = next_state;
646
647	if (ss->previous_state != sdma_state_s99_running &&
648	    next_state == sdma_state_s99_running)
649		sdma_flush(sde);
650
651	if (action[next_state].op_enable)
652		op |= SDMA_SENDCTRL_OP_ENABLE;
653
654	if (action[next_state].op_intenable)
655		op |= SDMA_SENDCTRL_OP_INTENABLE;
656
657	if (action[next_state].op_halt)
658		op |= SDMA_SENDCTRL_OP_HALT;
659
660	if (action[next_state].op_cleanup)
661		op |= SDMA_SENDCTRL_OP_CLEANUP;
662
663	if (action[next_state].go_s99_running_tofalse)
664		ss->go_s99_running = 0;
665
666	if (action[next_state].go_s99_running_totrue)
667		ss->go_s99_running = 1;
668
669	ss->current_op = op;
670	sdma_sendctrl(sde, ss->current_op);
671}
672
673/**
674 * sdma_get_descq_cnt() - called when device probed
675 *
676 * Return a validated descq count.
677 *
678 * This is currently only used in the verbs initialization to build the tx
679 * list.
680 *
681 * This will probably be deleted in favor of a more scalable approach to
682 * alloc tx's.
683 *
684 */
685u16 sdma_get_descq_cnt(void)
686{
687	u16 count = sdma_descq_cnt;
688
689	if (!count)
690		return SDMA_DESCQ_CNT;
	/* count must be a power of 2 between 64 and 32768, inclusive.
	 * Otherwise return the default.
693	 */
694	if (!is_power_of_2(count))
695		return SDMA_DESCQ_CNT;
696	if (count < 64 || count > 32768)
697		return SDMA_DESCQ_CNT;
698	return count;
699}
700
701/**
702 * sdma_engine_get_vl() - return vl for a given sdma engine
703 * @sde: sdma engine
704 *
705 * This function returns the vl mapped to a given engine, or an error if
706 * the mapping can't be found. The mapping fields are protected by RCU.
707 */
708int sdma_engine_get_vl(struct sdma_engine *sde)
709{
710	struct hfi1_devdata *dd = sde->dd;
711	struct sdma_vl_map *m;
712	u8 vl;
713
714	if (sde->this_idx >= TXE_NUM_SDMA_ENGINES)
715		return -EINVAL;
716
717	rcu_read_lock();
718	m = rcu_dereference(dd->sdma_map);
719	if (unlikely(!m)) {
720		rcu_read_unlock();
721		return -EINVAL;
722	}
723	vl = m->engine_to_vl[sde->this_idx];
724	rcu_read_unlock();
725
726	return vl;
727}
728
729/**
730 * sdma_select_engine_vl() - select sdma engine
731 * @dd: devdata
732 * @selector: a spreading factor
733 * @vl: this vl
734 *
735 *
736 * This function returns an engine based on the selector and a vl.  The
737 * mapping fields are protected by RCU.
738 */
739struct sdma_engine *sdma_select_engine_vl(
740	struct hfi1_devdata *dd,
741	u32 selector,
742	u8 vl)
743{
744	struct sdma_vl_map *m;
745	struct sdma_map_elem *e;
746	struct sdma_engine *rval;
747
748	/* NOTE This should only happen if SC->VL changed after the initial
749	 *      checks on the QP/AH
750	 *      Default will return engine 0 below
751	 */
752	if (vl >= num_vls) {
753		rval = NULL;
754		goto done;
755	}
756
757	rcu_read_lock();
758	m = rcu_dereference(dd->sdma_map);
759	if (unlikely(!m)) {
760		rcu_read_unlock();
761		return &dd->per_sdma[0];
762	}
763	e = m->map[vl & m->mask];
764	rval = e->sde[selector & e->mask];
765	rcu_read_unlock();
766
767done:
768	rval =  !rval ? &dd->per_sdma[0] : rval;
769	trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
770	return rval;
771}
772
773/**
774 * sdma_select_engine_sc() - select sdma engine
775 * @dd: devdata
776 * @selector: a spreading factor
777 * @sc5: the 5 bit sc
778 *
779 *
780 * This function returns an engine based on the selector and an sc.
781 */
782struct sdma_engine *sdma_select_engine_sc(
783	struct hfi1_devdata *dd,
784	u32 selector,
785	u8 sc5)
786{
787	u8 vl = sc_to_vlt(dd, sc5);
788
789	return sdma_select_engine_vl(dd, selector, vl);
790}
791
792struct sdma_rht_map_elem {
793	u32 mask;
794	u8 ctr;
795	struct sdma_engine *sde[];
796};
797
798struct sdma_rht_node {
799	unsigned long cpu_id;
800	struct sdma_rht_map_elem *map[HFI1_MAX_VLS_SUPPORTED];
801	struct rhash_head node;
802};
803
804#define NR_CPUS_HINT 192
805
806static const struct rhashtable_params sdma_rht_params = {
807	.nelem_hint = NR_CPUS_HINT,
808	.head_offset = offsetof(struct sdma_rht_node, node),
809	.key_offset = offsetof(struct sdma_rht_node, cpu_id),
810	.key_len = sizeof_field(struct sdma_rht_node, cpu_id),
811	.max_size = NR_CPUS,
812	.min_size = 8,
813	.automatic_shrinking = true,
814};
815
/**
817 * sdma_select_user_engine() - select sdma engine based on user setup
818 * @dd: devdata
819 * @selector: a spreading factor
820 * @vl: this vl
821 *
822 * This function returns an sdma engine for a user sdma request.
823 * User defined sdma engine affinity setting is honored when applicable,
824 * otherwise system default sdma engine mapping is used. To ensure correct
825 * ordering, the mapping from <selector, vl> to sde must remain unchanged.
826 */
827struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
828					    u32 selector, u8 vl)
829{
830	struct sdma_rht_node *rht_node;
831	struct sdma_engine *sde = NULL;
832	unsigned long cpu_id;
833
834	/*
	 * To ensure that the same sdma engine(s) are always selected,
	 * make sure the process is pinned to this CPU only.
837	 */
838	if (current->nr_cpus_allowed != 1)
839		goto out;
840
841	rcu_read_lock();
842	cpu_id = smp_processor_id();
843	rht_node = rhashtable_lookup(dd->sdma_rht, &cpu_id,
844				     sdma_rht_params);
845
846	if (rht_node && rht_node->map[vl]) {
847		struct sdma_rht_map_elem *map = rht_node->map[vl];
848
849		sde = map->sde[selector & map->mask];
850	}
851	rcu_read_unlock();
852
853	if (sde)
854		return sde;
855
856out:
857	return sdma_select_engine_vl(dd, selector, vl);
858}
859
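/*
 * Pad the map out to a power-of-2 number of entries by repeating the
 * first real entries, so that the mask-based lookup spreads selectors
 * evenly across the engines actually present.
 */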
860static void sdma_populate_sde_map(struct sdma_rht_map_elem *map)
861{
862	int i;
863
864	for (i = 0; i < roundup_pow_of_two(map->ctr ? : 1) - map->ctr; i++)
865		map->sde[map->ctr + i] = map->sde[i];
866}
867
868static void sdma_cleanup_sde_map(struct sdma_rht_map_elem *map,
869				 struct sdma_engine *sde)
870{
871	unsigned int i, pow;
872
873	/* only need to check the first ctr entries for a match */
874	for (i = 0; i < map->ctr; i++) {
875		if (map->sde[i] == sde) {
876			memmove(&map->sde[i], &map->sde[i + 1],
877				(map->ctr - i - 1) * sizeof(map->sde[0]));
878			map->ctr--;
879			pow = roundup_pow_of_two(map->ctr ? : 1);
880			map->mask = pow - 1;
881			sdma_populate_sde_map(map);
882			break;
883		}
884	}
885}
886
887/*
888 * Prevents concurrent reads and writes of the sdma engine cpu_mask
889 */
890static DEFINE_MUTEX(process_to_sde_mutex);
891
892ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf,
893				size_t count)
894{
895	struct hfi1_devdata *dd = sde->dd;
896	cpumask_var_t mask, new_mask;
897	unsigned long cpu;
898	int ret, vl, sz;
899	struct sdma_rht_node *rht_node;
900
901	vl = sdma_engine_get_vl(sde);
902	if (unlikely(vl < 0 || vl >= ARRAY_SIZE(rht_node->map)))
903		return -EINVAL;
904
905	ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
906	if (!ret)
907		return -ENOMEM;
908
909	ret = zalloc_cpumask_var(&new_mask, GFP_KERNEL);
910	if (!ret) {
911		free_cpumask_var(mask);
912		return -ENOMEM;
913	}
914	ret = cpulist_parse(buf, mask);
915	if (ret)
916		goto out_free;
917
918	if (!cpumask_subset(mask, cpu_online_mask)) {
919		dd_dev_warn(sde->dd, "Invalid CPU mask\n");
920		ret = -EINVAL;
921		goto out_free;
922	}
923
924	sz = sizeof(struct sdma_rht_map_elem) +
925			(TXE_NUM_SDMA_ENGINES * sizeof(struct sdma_engine *));
926
927	mutex_lock(&process_to_sde_mutex);
928
929	for_each_cpu(cpu, mask) {
930		/* Check if we have this already mapped */
931		if (cpumask_test_cpu(cpu, &sde->cpu_mask)) {
932			cpumask_set_cpu(cpu, new_mask);
933			continue;
934		}
935
936		rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu,
937						  sdma_rht_params);
938		if (!rht_node) {
939			rht_node = kzalloc(sizeof(*rht_node), GFP_KERNEL);
940			if (!rht_node) {
941				ret = -ENOMEM;
942				goto out;
943			}
944
945			rht_node->map[vl] = kzalloc(sz, GFP_KERNEL);
946			if (!rht_node->map[vl]) {
947				kfree(rht_node);
948				ret = -ENOMEM;
949				goto out;
950			}
951			rht_node->cpu_id = cpu;
952			rht_node->map[vl]->mask = 0;
953			rht_node->map[vl]->ctr = 1;
954			rht_node->map[vl]->sde[0] = sde;
955
956			ret = rhashtable_insert_fast(dd->sdma_rht,
957						     &rht_node->node,
958						     sdma_rht_params);
959			if (ret) {
960				kfree(rht_node->map[vl]);
961				kfree(rht_node);
962				dd_dev_err(sde->dd, "Failed to set process to sde affinity for cpu %lu\n",
963					   cpu);
964				goto out;
965			}
966
967		} else {
968			int ctr, pow;
969
970			/* Add new user mappings */
971			if (!rht_node->map[vl])
972				rht_node->map[vl] = kzalloc(sz, GFP_KERNEL);
973
974			if (!rht_node->map[vl]) {
975				ret = -ENOMEM;
976				goto out;
977			}
978
979			rht_node->map[vl]->ctr++;
980			ctr = rht_node->map[vl]->ctr;
981			rht_node->map[vl]->sde[ctr - 1] = sde;
982			pow = roundup_pow_of_two(ctr);
983			rht_node->map[vl]->mask = pow - 1;
984
985			/* Populate the sde map table */
986			sdma_populate_sde_map(rht_node->map[vl]);
987		}
988		cpumask_set_cpu(cpu, new_mask);
989	}
990
991	/* Clean up old mappings */
992	for_each_cpu(cpu, cpu_online_mask) {
993		struct sdma_rht_node *rht_node;
994
995		/* Don't cleanup sdes that are set in the new mask */
996		if (cpumask_test_cpu(cpu, mask))
997			continue;
998
999		rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu,
1000						  sdma_rht_params);
1001		if (rht_node) {
1002			bool empty = true;
1003			int i;
1004
1005			/* Remove mappings for old sde */
1006			for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++)
1007				if (rht_node->map[i])
1008					sdma_cleanup_sde_map(rht_node->map[i],
1009							     sde);
1010
1011			/* Free empty hash table entries */
1012			for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) {
1013				if (!rht_node->map[i])
1014					continue;
1015
1016				if (rht_node->map[i]->ctr) {
1017					empty = false;
1018					break;
1019				}
1020			}
1021
1022			if (empty) {
1023				ret = rhashtable_remove_fast(dd->sdma_rht,
1024							     &rht_node->node,
1025							     sdma_rht_params);
1026				WARN_ON(ret);
1027
1028				for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++)
1029					kfree(rht_node->map[i]);
1030
1031				kfree(rht_node);
1032			}
1033		}
1034	}
1035
1036	cpumask_copy(&sde->cpu_mask, new_mask);
1037out:
1038	mutex_unlock(&process_to_sde_mutex);
1039out_free:
1040	free_cpumask_var(mask);
1041	free_cpumask_var(new_mask);
1042	return ret ? : strnlen(buf, PAGE_SIZE);
1043}
1044
1045ssize_t sdma_get_cpu_to_sde_map(struct sdma_engine *sde, char *buf)
1046{
1047	mutex_lock(&process_to_sde_mutex);
1048	if (cpumask_empty(&sde->cpu_mask))
1049		snprintf(buf, PAGE_SIZE, "%s\n", "empty");
1050	else
1051		cpumap_print_to_pagebuf(true, buf, &sde->cpu_mask);
1052	mutex_unlock(&process_to_sde_mutex);
1053	return strnlen(buf, PAGE_SIZE);
1054}
1055
1056static void sdma_rht_free(void *ptr, void *arg)
1057{
1058	struct sdma_rht_node *rht_node = ptr;
1059	int i;
1060
1061	for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++)
1062		kfree(rht_node->map[i]);
1063
1064	kfree(rht_node);
1065}
1066
1067/**
1068 * sdma_seqfile_dump_cpu_list() - debugfs dump the cpu to sdma mappings
1069 * @s: seq file
1070 * @dd: hfi1_devdata
1071 * @cpuid: cpu id
1072 *
1073 * This routine dumps the process to sde mappings per cpu
1074 */
1075void sdma_seqfile_dump_cpu_list(struct seq_file *s,
1076				struct hfi1_devdata *dd,
1077				unsigned long cpuid)
1078{
1079	struct sdma_rht_node *rht_node;
1080	int i, j;
1081
1082	rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpuid,
1083					  sdma_rht_params);
1084	if (!rht_node)
1085		return;
1086
1087	seq_printf(s, "cpu%3lu: ", cpuid);
1088	for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) {
1089		if (!rht_node->map[i] || !rht_node->map[i]->ctr)
1090			continue;
1091
1092		seq_printf(s, " vl%d: [", i);
1093
1094		for (j = 0; j < rht_node->map[i]->ctr; j++) {
1095			if (!rht_node->map[i]->sde[j])
1096				continue;
1097
1098			if (j > 0)
1099				seq_puts(s, ",");
1100
1101			seq_printf(s, " sdma%2d",
1102				   rht_node->map[i]->sde[j]->this_idx);
1103		}
1104		seq_puts(s, " ]");
1105	}
1106
1107	seq_puts(s, "\n");
1108}
1109
1110/*
1111 * Free the indicated map struct
1112 */
1113static void sdma_map_free(struct sdma_vl_map *m)
1114{
1115	int i;
1116
1117	for (i = 0; m && i < m->actual_vls; i++)
1118		kfree(m->map[i]);
1119	kfree(m);
1120}
1121
1122/*
1123 * Handle RCU callback
1124 */
1125static void sdma_map_rcu_callback(struct rcu_head *list)
1126{
1127	struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list);
1128
1129	sdma_map_free(m);
1130}
1131
1132/**
 * sdma_map_init - called when the number of VLs changes
1134 * @dd: hfi1_devdata
1135 * @port: port number
1136 * @num_vls: number of vls
1137 * @vl_engines: per vl engine mapping (optional)
1138 *
1139 * This routine changes the mapping based on the number of vls.
1140 *
1141 * vl_engines is used to specify a non-uniform vl/engine loading. NULL
 * implies auto-computing the loading and giving each VL a uniform
 * distribution of engines.
1144 *
1145 * The auto algorithm computes the sde_per_vl and the number of extra
1146 * engines.  Any extra engines are added from the last VL on down.
1147 *
1148 * rcu locking is used here to control access to the mapping fields.
1149 *
 * If either num_vls or num_sdma is not a power of 2, the array sizes
 * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
 * up to the next highest power of 2 and the first entry is reused
 * in a round-robin fashion.
1154 *
1155 * If an error occurs the map change is not done and the mapping is
1156 * not changed.
1157 *
1158 */
1159int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
1160{
1161	int i, j;
1162	int extra, sde_per_vl;
1163	int engine = 0;
1164	u8 lvl_engines[OPA_MAX_VLS];
1165	struct sdma_vl_map *oldmap, *newmap;
1166
1167	if (!(dd->flags & HFI1_HAS_SEND_DMA))
1168		return 0;
1169
1170	if (!vl_engines) {
1171		/* truncate divide */
1172		sde_per_vl = dd->num_sdma / num_vls;
1173		/* extras */
1174		extra = dd->num_sdma % num_vls;
1175		vl_engines = lvl_engines;
1176		/* add extras from last vl down */
1177		for (i = num_vls - 1; i >= 0; i--, extra--)
1178			vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
1179	}
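	/*
	 * Worked example (illustrative): with 16 engines and 3 VLs,
	 * sde_per_vl = 5 and extra = 1, so vl_engines ends up {5, 5, 6},
	 * the single extra engine going to the last VL.
	 */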
1180	/* build new map */
1181	newmap = kzalloc(
1182		sizeof(struct sdma_vl_map) +
1183			roundup_pow_of_two(num_vls) *
1184			sizeof(struct sdma_map_elem *),
1185		GFP_KERNEL);
1186	if (!newmap)
1187		goto bail;
1188	newmap->actual_vls = num_vls;
1189	newmap->vls = roundup_pow_of_two(num_vls);
1190	newmap->mask = (1 << ilog2(newmap->vls)) - 1;
1191	/* initialize back-map */
1192	for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++)
1193		newmap->engine_to_vl[i] = -1;
1194	for (i = 0; i < newmap->vls; i++) {
1195		/* save for wrap around */
1196		int first_engine = engine;
1197
1198		if (i < newmap->actual_vls) {
1199			int sz = roundup_pow_of_two(vl_engines[i]);
1200
1201			/* only allocate once */
1202			newmap->map[i] = kzalloc(
1203				sizeof(struct sdma_map_elem) +
1204					sz * sizeof(struct sdma_engine *),
1205				GFP_KERNEL);
1206			if (!newmap->map[i])
1207				goto bail;
1208			newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
1209			/* assign engines */
1210			for (j = 0; j < sz; j++) {
1211				newmap->map[i]->sde[j] =
1212					&dd->per_sdma[engine];
1213				if (++engine >= first_engine + vl_engines[i])
1214					/* wrap back to first engine */
1215					engine = first_engine;
1216			}
1217			/* assign back-map */
1218			for (j = 0; j < vl_engines[i]; j++)
1219				newmap->engine_to_vl[first_engine + j] = i;
1220		} else {
1221			/* just re-use entry without allocating */
1222			newmap->map[i] = newmap->map[i % num_vls];
1223		}
1224		engine = first_engine + vl_engines[i];
1225	}
1226	/* newmap in hand, save old map */
1227	spin_lock_irq(&dd->sde_map_lock);
1228	oldmap = rcu_dereference_protected(dd->sdma_map,
1229					   lockdep_is_held(&dd->sde_map_lock));
1230
1231	/* publish newmap */
1232	rcu_assign_pointer(dd->sdma_map, newmap);
1233
1234	spin_unlock_irq(&dd->sde_map_lock);
1235	/* success, free any old map after grace period */
1236	if (oldmap)
1237		call_rcu(&oldmap->list, sdma_map_rcu_callback);
1238	return 0;
1239bail:
1240	/* free any partial allocation */
1241	sdma_map_free(newmap);
1242	return -ENOMEM;
1243}
1244
1245/**
1246 * sdma_clean - Clean up allocated memory
1247 * @dd:          struct hfi1_devdata
1248 * @num_engines: num sdma engines
1249 *
1250 * This routine can be called regardless of the success of
1251 * sdma_init()
1252 */
1253void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
1254{
1255	size_t i;
1256	struct sdma_engine *sde;
1257
1258	if (dd->sdma_pad_dma) {
1259		dma_free_coherent(&dd->pcidev->dev, SDMA_PAD,
1260				  (void *)dd->sdma_pad_dma,
1261				  dd->sdma_pad_phys);
1262		dd->sdma_pad_dma = NULL;
1263		dd->sdma_pad_phys = 0;
1264	}
1265	if (dd->sdma_heads_dma) {
1266		dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
1267				  (void *)dd->sdma_heads_dma,
1268				  dd->sdma_heads_phys);
1269		dd->sdma_heads_dma = NULL;
1270		dd->sdma_heads_phys = 0;
1271	}
1272	for (i = 0; dd->per_sdma && i < num_engines; ++i) {
1273		sde = &dd->per_sdma[i];
1274
1275		sde->head_dma = NULL;
1276		sde->head_phys = 0;
1277
1278		if (sde->descq) {
1279			dma_free_coherent(
1280				&dd->pcidev->dev,
1281				sde->descq_cnt * sizeof(u64[2]),
1282				sde->descq,
1283				sde->descq_phys
1284			);
1285			sde->descq = NULL;
1286			sde->descq_phys = 0;
1287		}
1288		kvfree(sde->tx_ring);
1289		sde->tx_ring = NULL;
1290	}
1291	if (rcu_access_pointer(dd->sdma_map)) {
1292		spin_lock_irq(&dd->sde_map_lock);
1293		sdma_map_free(rcu_access_pointer(dd->sdma_map));
1294		RCU_INIT_POINTER(dd->sdma_map, NULL);
1295		spin_unlock_irq(&dd->sde_map_lock);
1296		synchronize_rcu();
1297	}
1298	kfree(dd->per_sdma);
1299	dd->per_sdma = NULL;
1300
1301	if (dd->sdma_rht) {
1302		rhashtable_free_and_destroy(dd->sdma_rht, sdma_rht_free, NULL);
1303		kfree(dd->sdma_rht);
1304		dd->sdma_rht = NULL;
1305	}
1306}
1307
1308/**
1309 * sdma_init() - called when device probed
1310 * @dd: hfi1_devdata
1311 * @port: port number (currently only zero)
1312 *
1313 * Initializes each sde and its csrs.
1314 * Interrupts are not required to be enabled.
1315 *
1316 * Returns:
1317 * 0 - success, -errno on failure
1318 */
1319int sdma_init(struct hfi1_devdata *dd, u8 port)
1320{
1321	unsigned this_idx;
1322	struct sdma_engine *sde;
1323	struct rhashtable *tmp_sdma_rht;
1324	u16 descq_cnt;
1325	void *curr_head;
1326	struct hfi1_pportdata *ppd = dd->pport + port;
1327	u32 per_sdma_credits;
1328	uint idle_cnt = sdma_idle_cnt;
1329	size_t num_engines = chip_sdma_engines(dd);
1330	int ret = -ENOMEM;
1331
1332	if (!HFI1_CAP_IS_KSET(SDMA)) {
1333		HFI1_CAP_CLEAR(SDMA_AHG);
1334		return 0;
1335	}
1336	if (mod_num_sdma &&
1337	    /* can't exceed chip support */
1338	    mod_num_sdma <= chip_sdma_engines(dd) &&
1339	    /* count must be >= vls */
1340	    mod_num_sdma >= num_vls)
1341		num_engines = mod_num_sdma;
1342
1343	dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
1344	dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", chip_sdma_engines(dd));
1345	dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
1346		    chip_sdma_mem_size(dd));
1347
1348	per_sdma_credits =
1349		chip_sdma_mem_size(dd) / (num_engines * SDMA_BLOCK_SIZE);
1350
1351	/* set up freeze waitqueue */
1352	init_waitqueue_head(&dd->sdma_unfreeze_wq);
1353	atomic_set(&dd->sdma_unfreeze_count, 0);
1354
1355	descq_cnt = sdma_get_descq_cnt();
1356	dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
1357		    num_engines, descq_cnt);
1358
1359	/* alloc memory for array of send engines */
1360	dd->per_sdma = kcalloc_node(num_engines, sizeof(*dd->per_sdma),
1361				    GFP_KERNEL, dd->node);
1362	if (!dd->per_sdma)
1363		return ret;
1364
1365	idle_cnt = ns_to_cclock(dd, idle_cnt);
1366	if (idle_cnt)
1367		dd->default_desc1 =
1368			SDMA_DESC1_HEAD_TO_HOST_FLAG;
1369	else
1370		dd->default_desc1 =
1371			SDMA_DESC1_INT_REQ_FLAG;
1372
1373	if (!sdma_desct_intr)
1374		sdma_desct_intr = SDMA_DESC_INTR;
1375
1376	/* Allocate memory for SendDMA descriptor FIFOs */
1377	for (this_idx = 0; this_idx < num_engines; ++this_idx) {
1378		sde = &dd->per_sdma[this_idx];
1379		sde->dd = dd;
1380		sde->ppd = ppd;
1381		sde->this_idx = this_idx;
1382		sde->descq_cnt = descq_cnt;
1383		sde->desc_avail = sdma_descq_freecnt(sde);
1384		sde->sdma_shift = ilog2(descq_cnt);
1385		sde->sdma_mask = (1 << sde->sdma_shift) - 1;
1386
1387		/* Create a mask specifically for each interrupt source */
1388		sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES +
1389					   this_idx);
1390		sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES +
1391						this_idx);
1392		sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES +
1393					    this_idx);
1394		/* Create a combined mask to cover all 3 interrupt sources */
1395		sde->imask = sde->int_mask | sde->progress_mask |
1396			     sde->idle_mask;
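		/*
		 * Illustrative: assuming TXE_NUM_SDMA_ENGINES is 16, engine 3
		 * would use bits 3 (int), 19 (progress) and 35 (idle).
		 */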
1397
1398		spin_lock_init(&sde->tail_lock);
1399		seqlock_init(&sde->head_lock);
1400		spin_lock_init(&sde->senddmactrl_lock);
1401		spin_lock_init(&sde->flushlist_lock);
1402		seqlock_init(&sde->waitlock);
		/* ensure there is always a zero bit */
1404		sde->ahg_bits = 0xfffffffe00000000ULL;
1405
1406		sdma_set_state(sde, sdma_state_s00_hw_down);
1407
1408		/* set up reference counting */
1409		kref_init(&sde->state.kref);
1410		init_completion(&sde->state.comp);
1411
1412		INIT_LIST_HEAD(&sde->flushlist);
1413		INIT_LIST_HEAD(&sde->dmawait);
1414
1415		sde->tail_csr =
1416			get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
1417
1418		tasklet_setup(&sde->sdma_hw_clean_up_task,
1419			      sdma_hw_clean_up_task);
1420		tasklet_setup(&sde->sdma_sw_clean_up_task,
1421			      sdma_sw_clean_up_task);
1422		INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
1423		INIT_WORK(&sde->flush_worker, sdma_field_flush);
1424
1425		sde->progress_check_head = 0;
1426
1427		timer_setup(&sde->err_progress_check_timer,
1428			    sdma_err_progress_check, 0);
1429
1430		sde->descq = dma_alloc_coherent(&dd->pcidev->dev,
1431						descq_cnt * sizeof(u64[2]),
1432						&sde->descq_phys, GFP_KERNEL);
1433		if (!sde->descq)
1434			goto bail;
1435		sde->tx_ring =
1436			kvzalloc_node(array_size(descq_cnt,
1437						 sizeof(struct sdma_txreq *)),
1438				      GFP_KERNEL, dd->node);
1439		if (!sde->tx_ring)
1440			goto bail;
1441	}
1442
1443	dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
1444	/* Allocate memory for DMA of head registers to memory */
1445	dd->sdma_heads_dma = dma_alloc_coherent(&dd->pcidev->dev,
1446						dd->sdma_heads_size,
1447						&dd->sdma_heads_phys,
1448						GFP_KERNEL);
1449	if (!dd->sdma_heads_dma) {
1450		dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
1451		goto bail;
1452	}
1453
1454	/* Allocate memory for pad */
1455	dd->sdma_pad_dma = dma_alloc_coherent(&dd->pcidev->dev, SDMA_PAD,
1456					      &dd->sdma_pad_phys, GFP_KERNEL);
1457	if (!dd->sdma_pad_dma) {
1458		dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
1459		goto bail;
1460	}
1461
1462	/* assign each engine to different cacheline and init registers */
1463	curr_head = (void *)dd->sdma_heads_dma;
1464	for (this_idx = 0; this_idx < num_engines; ++this_idx) {
1465		unsigned long phys_offset;
1466
1467		sde = &dd->per_sdma[this_idx];
1468
1469		sde->head_dma = curr_head;
1470		curr_head += L1_CACHE_BYTES;
1471		phys_offset = (unsigned long)sde->head_dma -
1472			      (unsigned long)dd->sdma_heads_dma;
1473		sde->head_phys = dd->sdma_heads_phys + phys_offset;
1474		init_sdma_regs(sde, per_sdma_credits, idle_cnt);
1475	}
1476	dd->flags |= HFI1_HAS_SEND_DMA;
1477	dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0;
1478	dd->num_sdma = num_engines;
1479	ret = sdma_map_init(dd, port, ppd->vls_operational, NULL);
1480	if (ret < 0)
1481		goto bail;
1482
1483	tmp_sdma_rht = kzalloc(sizeof(*tmp_sdma_rht), GFP_KERNEL);
1484	if (!tmp_sdma_rht) {
1485		ret = -ENOMEM;
1486		goto bail;
1487	}
1488
1489	ret = rhashtable_init(tmp_sdma_rht, &sdma_rht_params);
1490	if (ret < 0) {
1491		kfree(tmp_sdma_rht);
1492		goto bail;
1493	}
1494
1495	dd->sdma_rht = tmp_sdma_rht;
1496
1497	dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
1498	return 0;
1499
1500bail:
1501	sdma_clean(dd, num_engines);
1502	return ret;
1503}
1504
1505/**
1506 * sdma_all_running() - called when the link goes up
1507 * @dd: hfi1_devdata
1508 *
1509 * This routine moves all engines to the running state.
1510 */
1511void sdma_all_running(struct hfi1_devdata *dd)
1512{
1513	struct sdma_engine *sde;
1514	unsigned int i;
1515
1516	/* move all engines to running */
1517	for (i = 0; i < dd->num_sdma; ++i) {
1518		sde = &dd->per_sdma[i];
1519		sdma_process_event(sde, sdma_event_e30_go_running);
1520	}
1521}
1522
1523/**
1524 * sdma_all_idle() - called when the link goes down
1525 * @dd: hfi1_devdata
1526 *
1527 * This routine moves all engines to the idle state.
1528 */
1529void sdma_all_idle(struct hfi1_devdata *dd)
1530{
1531	struct sdma_engine *sde;
1532	unsigned int i;
1533
1534	/* idle all engines */
1535	for (i = 0; i < dd->num_sdma; ++i) {
1536		sde = &dd->per_sdma[i];
1537		sdma_process_event(sde, sdma_event_e70_go_idle);
1538	}
1539}
1540
1541/**
1542 * sdma_start() - called to kick off state processing for all engines
1543 * @dd: hfi1_devdata
1544 *
1545 * This routine is for kicking off the state processing for all required
1546 * sdma engines.  Interrupts need to be working at this point.
1547 *
1548 */
1549void sdma_start(struct hfi1_devdata *dd)
1550{
1551	unsigned i;
1552	struct sdma_engine *sde;
1553
1554	/* kick off the engines state processing */
1555	for (i = 0; i < dd->num_sdma; ++i) {
1556		sde = &dd->per_sdma[i];
1557		sdma_process_event(sde, sdma_event_e10_go_hw_start);
1558	}
1559}
1560
1561/**
1562 * sdma_exit() - used when module is removed
1563 * @dd: hfi1_devdata
1564 */
1565void sdma_exit(struct hfi1_devdata *dd)
1566{
1567	unsigned this_idx;
1568	struct sdma_engine *sde;
1569
1570	for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma;
1571			++this_idx) {
1572		sde = &dd->per_sdma[this_idx];
1573		if (!list_empty(&sde->dmawait))
1574			dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
1575				   sde->this_idx);
1576		sdma_process_event(sde, sdma_event_e00_go_hw_down);
1577
1578		del_timer_sync(&sde->err_progress_check_timer);
1579
1580		/*
1581		 * This waits for the state machine to exit so it is not
1582		 * necessary to kill the sdma_sw_clean_up_task to make sure
1583		 * it is not running.
1584		 */
1585		sdma_finalput(&sde->state);
1586	}
1587}
1588
1589/*
1590 * unmap the indicated descriptor
1591 */
1592static inline void sdma_unmap_desc(
1593	struct hfi1_devdata *dd,
1594	struct sdma_desc *descp)
1595{
1596	switch (sdma_mapping_type(descp)) {
1597	case SDMA_MAP_SINGLE:
1598		dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp),
1599				 sdma_mapping_len(descp), DMA_TO_DEVICE);
1600		break;
1601	case SDMA_MAP_PAGE:
1602		dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp),
1603			       sdma_mapping_len(descp), DMA_TO_DEVICE);
1604		break;
1605	}
1606
1607	if (descp->pinning_ctx && descp->ctx_put)
1608		descp->ctx_put(descp->pinning_ctx);
1609	descp->pinning_ctx = NULL;
1610}
1611
1612/*
1613 * return the mode as indicated by the first
1614 * descriptor in the tx.
1615 */
1616static inline u8 ahg_mode(struct sdma_txreq *tx)
1617{
1618	return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK)
1619		>> SDMA_DESC1_HEADER_MODE_SHIFT;
1620}
1621
1622/**
1623 * __sdma_txclean() - clean tx of mappings, descp *kmalloc's
1624 * @dd: hfi1_devdata for unmapping
1625 * @tx: tx request to clean
1626 *
1627 * This is used in the progress routine to clean the tx or
1628 * by the ULP to toss an in-process tx build.
1629 *
1630 * The code can be called multiple times without issue.
1631 *
1632 */
1633void __sdma_txclean(
1634	struct hfi1_devdata *dd,
1635	struct sdma_txreq *tx)
1636{
1637	u16 i;
1638
1639	if (tx->num_desc) {
1640		u8 skip = 0, mode = ahg_mode(tx);
1641
1642		/* unmap first */
1643		sdma_unmap_desc(dd, &tx->descp[0]);
1644		/* determine number of AHG descriptors to skip */
1645		if (mode > SDMA_AHG_APPLY_UPDATE1)
1646			skip = mode >> 1;
1647		for (i = 1 + skip; i < tx->num_desc; i++)
1648			sdma_unmap_desc(dd, &tx->descp[i]);
1649		tx->num_desc = 0;
1650	}
1651	kfree(tx->coalesce_buf);
1652	tx->coalesce_buf = NULL;
1653	/* kmalloc'ed descp */
1654	if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) {
1655		tx->desc_limit = ARRAY_SIZE(tx->descs);
1656		kfree(tx->descp);
1657	}
1658}
1659
1660static inline u16 sdma_gethead(struct sdma_engine *sde)
1661{
1662	struct hfi1_devdata *dd = sde->dd;
1663	int use_dmahead;
1664	u16 hwhead;
1665
1666#ifdef CONFIG_SDMA_VERBOSITY
1667	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
1668		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
1669#endif
1670
1671retry:
1672	use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
1673					(dd->flags & HFI1_HAS_SDMA_TIMEOUT);
1674	hwhead = use_dmahead ?
1675		(u16)le64_to_cpu(*sde->head_dma) :
1676		(u16)read_sde_csr(sde, SD(HEAD));
1677
1678	if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
1679		u16 cnt;
1680		u16 swtail;
1681		u16 swhead;
1682		int sane;
1683
1684		swhead = sde->descq_head & sde->sdma_mask;
1685		/* this code is really bad for cache line trading */
1686		swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
1687		cnt = sde->descq_cnt;
1688
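		/*
		 * hwhead must lie between the software head and tail
		 * (inclusive), allowing for ring wrap; anything else means
		 * the hardware value cannot be trusted.
		 */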
1689		if (swhead < swtail)
1690			/* not wrapped */
1691			sane = (hwhead >= swhead) & (hwhead <= swtail);
1692		else if (swhead > swtail)
1693			/* wrapped around */
1694			sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
1695				(hwhead <= swtail);
1696		else
1697			/* empty */
1698			sane = (hwhead == swhead);
1699
1700		if (unlikely(!sane)) {
1701			dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%u swhd=%u swtl=%u cnt=%u\n",
1702				   sde->this_idx,
1703				   use_dmahead ? "dma" : "kreg",
1704				   hwhead, swhead, swtail, cnt);
1705			if (use_dmahead) {
1706				/* try one more time, using csr */
1707				use_dmahead = 0;
1708				goto retry;
1709			}
1710			/* proceed as if no progress */
1711			hwhead = swhead;
1712		}
1713	}
1714	return hwhead;
1715}
1716
1717/*
1718 * This is called when there are send DMA descriptors that might be
1719 * available.
1720 *
1721 * This is called with head_lock held.
1722 */
1723static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
1724{
1725	struct iowait *wait, *nw, *twait;
1726	struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
1727	uint i, n = 0, seq, tidx = 0;
1728
1729#ifdef CONFIG_SDMA_VERBOSITY
1730	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
1731		   slashstrip(__FILE__), __LINE__, __func__);
1732	dd_dev_err(sde->dd, "avail: %u\n", avail);
1733#endif
1734
1735	do {
1736		seq = read_seqbegin(&sde->waitlock);
1737		if (!list_empty(&sde->dmawait)) {
1738			/* at least one item */
1739			write_seqlock(&sde->waitlock);
1740			/* Harvest waiters wanting DMA descriptors */
1741			list_for_each_entry_safe(
1742					wait,
1743					nw,
1744					&sde->dmawait,
1745					list) {
1746				u32 num_desc;
1747
1748				if (!wait->wakeup)
1749					continue;
1750				if (n == ARRAY_SIZE(waits))
1751					break;
1752				iowait_init_priority(wait);
1753				num_desc = iowait_get_all_desc(wait);
1754				if (num_desc > avail)
1755					break;
1756				avail -= num_desc;
				/* Find the top-priority wait member */
1758				if (n) {
1759					twait = waits[tidx];
1760					tidx =
1761					    iowait_priority_update_top(wait,
1762								       twait,
1763								       n,
1764								       tidx);
1765				}
1766				list_del_init(&wait->list);
1767				waits[n++] = wait;
1768			}
1769			write_sequnlock(&sde->waitlock);
1770			break;
1771		}
1772	} while (read_seqretry(&sde->waitlock, seq));
1773
1774	/* Schedule the top-priority entry first */
1775	if (n)
1776		waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON);
1777
1778	for (i = 0; i < n; i++)
1779		if (i != tidx)
1780			waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
1781}
1782
1783/* head_lock must be held */
1784static void sdma_make_progress(struct sdma_engine *sde, u64 status)
1785{
1786	struct sdma_txreq *txp = NULL;
1787	int progress = 0;
1788	u16 hwhead, swhead;
1789	int idle_check_done = 0;
1790
1791	hwhead = sdma_gethead(sde);
1792
1793	/* The reason for some of the complexity of this code is that
1794	 * not all descriptors have corresponding txps.  So, we have to
1795	 * be able to skip over descs until we wander into the range of
1796	 * the next txp on the list.
1797	 */
1798
1799retry:
1800	txp = get_txhead(sde);
1801	swhead = sde->descq_head & sde->sdma_mask;
1802	trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
1803	while (swhead != hwhead) {
1804		/* advance head, wrap if needed */
1805		swhead = ++sde->descq_head & sde->sdma_mask;
1806
1807		/* if now past this txp's descs, do the callback */
1808		if (txp && txp->next_descq_idx == swhead) {
1809			/* remove from list */
1810			sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
1811			complete_tx(sde, txp, SDMA_TXREQ_S_OK);
1812			/* see if there is another txp */
1813			txp = get_txhead(sde);
1814		}
1815		trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
1816		progress++;
1817	}
1818
1819	/*
1820	 * The SDMA idle interrupt is not guaranteed to be ordered with respect
1821	 * to updates to the dma_head location in host memory. The head
1822	 * value read might not be fully up to date. If there are pending
	 * descriptors and the SDMA idle interrupt fired, then read from the
	 * CSR SDMA head instead to get the latest value from the hardware.
	 * The hardware SDMA head should be read at most once per invocation
	 * of sdma_make_progress(), which is ensured by the idle_check_done flag.
1827	 */
1828	if ((status & sde->idle_mask) && !idle_check_done) {
1829		u16 swtail;
1830
1831		swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
1832		if (swtail != hwhead) {
1833			hwhead = (u16)read_sde_csr(sde, SD(HEAD));
1834			idle_check_done = 1;
1835			goto retry;
1836		}
1837	}
1838
1839	sde->last_status = status;
1840	if (progress)
1841		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
1842}
1843
/**
 * sdma_engine_interrupt() - interrupt handler for engine
 * @sde: sdma engine
 * @status: sdma interrupt reason
 *
 * Status is a mask of the 3 possible interrupts for this engine.  It will
 * contain bits _only_ for this SDMA engine.  It will contain at least one
 * bit; it may contain more.
1852 */
1853void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
1854{
1855	trace_hfi1_sdma_engine_interrupt(sde, status);
1856	write_seqlock(&sde->head_lock);
1857	sdma_set_desc_cnt(sde, sdma_desct_intr);
1858	if (status & sde->idle_mask)
1859		sde->idle_int_cnt++;
1860	else if (status & sde->progress_mask)
1861		sde->progress_int_cnt++;
1862	else if (status & sde->int_mask)
1863		sde->sdma_int_cnt++;
1864	sdma_make_progress(sde, status);
1865	write_sequnlock(&sde->head_lock);
1866}
1867
1868/**
1869 * sdma_engine_error() - error handler for engine
1870 * @sde: sdma engine
1871 * @status: sdma interrupt reason
1872 */
1873void sdma_engine_error(struct sdma_engine *sde, u64 status)
1874{
1875	unsigned long flags;
1876
1877#ifdef CONFIG_SDMA_VERBOSITY
1878	dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
1879		   sde->this_idx,
1880		   (unsigned long long)status,
1881		   sdma_state_names[sde->state.current_state]);
1882#endif
1883	spin_lock_irqsave(&sde->tail_lock, flags);
1884	write_seqlock(&sde->head_lock);
1885	if (status & ALL_SDMA_ENG_HALT_ERRS)
1886		__sdma_process_event(sde, sdma_event_e60_hw_halted);
1887	if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) {
1888		dd_dev_err(sde->dd,
1889			   "SDMA (%u) engine error: 0x%llx state %s\n",
1890			   sde->this_idx,
1891			   (unsigned long long)status,
1892			   sdma_state_names[sde->state.current_state]);
1893		dump_sdma_state(sde);
1894	}
1895	write_sequnlock(&sde->head_lock);
1896	spin_unlock_irqrestore(&sde->tail_lock, flags);
1897}
1898
1899static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
1900{
1901	u64 set_senddmactrl = 0;
1902	u64 clr_senddmactrl = 0;
1903	unsigned long flags;
1904
1905#ifdef CONFIG_SDMA_VERBOSITY
1906	dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
1907		   sde->this_idx,
1908		   (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
1909		   (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
1910		   (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
1911		   (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
1912#endif
1913
1914	if (op & SDMA_SENDCTRL_OP_ENABLE)
1915		set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
1916	else
1917		clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK);
1918
1919	if (op & SDMA_SENDCTRL_OP_INTENABLE)
1920		set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
1921	else
1922		clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
1923
1924	if (op & SDMA_SENDCTRL_OP_HALT)
1925		set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
1926	else
1927		clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK);
1928
1929	spin_lock_irqsave(&sde->senddmactrl_lock, flags);
1930
1931	sde->p_senddmactrl |= set_senddmactrl;
1932	sde->p_senddmactrl &= ~clr_senddmactrl;
1933
1934	if (op & SDMA_SENDCTRL_OP_CLEANUP)
1935		write_sde_csr(sde, SD(CTRL),
1936			      sde->p_senddmactrl |
1937			      SD(CTRL_SDMA_CLEANUP_SMASK));
1938	else
1939		write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl);
1940
1941	spin_unlock_irqrestore(&sde->senddmactrl_lock, flags);
1942
1943#ifdef CONFIG_SDMA_VERBOSITY
1944	sdma_dumpstate(sde);
1945#endif
1946}
1947
1948static void sdma_setlengen(struct sdma_engine *sde)
1949{
1950#ifdef CONFIG_SDMA_VERBOSITY
1951	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
1952		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
1953#endif
1954
1955	/*
1956	 * Set SendDmaLenGen and clear-then-set the MSB of the generation
1957	 * count to enable generation checking and load the internal
1958	 * generation counter.
1959	 */
1960	write_sde_csr(sde, SD(LEN_GEN),
1961		      (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT));
1962	write_sde_csr(sde, SD(LEN_GEN),
1963		      ((sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)) |
1964		      (4ULL << SD(LEN_GEN_GENERATION_SHIFT)));
1965}
1966
1967static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
1968{
1969	/* Commit writes to memory and advance the tail on the chip */
1970	smp_wmb(); /* see get_txhead() */
1971	writeq(tail, sde->tail_csr);
1972}
1973
1974/*
1975 * This is called when changing to state s10_hw_start_up_halt_wait as
1976 * a result of send buffer errors or send DMA descriptor errors.
1977 */
1978static void sdma_hw_start_up(struct sdma_engine *sde)
1979{
1980	u64 reg;
1981
1982#ifdef CONFIG_SDMA_VERBOSITY
1983	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
1984		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
1985#endif
1986
1987	sdma_setlengen(sde);
1988	sdma_update_tail(sde, 0); /* Set SendDmaTail */
1989	*sde->head_dma = 0;
1990
1991	reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
1992	      SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);
1993	write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg);
1994}
1995
1996/*
1997 * set_sdma_integrity
1998 *
1999 * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'.
2000 */
2001static void set_sdma_integrity(struct sdma_engine *sde)
2002{
2003	struct hfi1_devdata *dd = sde->dd;
2004
2005	write_sde_csr(sde, SD(CHECK_ENABLE),
2006		      hfi1_pkt_base_sdma_integrity(dd));
2007}
2008
2009static void init_sdma_regs(
2010	struct sdma_engine *sde,
2011	u32 credits,
2012	uint idle_cnt)
2013{
2014	u8 opval, opmask;
2015#ifdef CONFIG_SDMA_VERBOSITY
2016	struct hfi1_devdata *dd = sde->dd;
2017
2018	dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
2019		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
2020#endif
2021
2022	write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
2023	sdma_setlengen(sde);
2024	sdma_update_tail(sde, 0); /* Set SendDmaTail */
2025	write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
2026	write_sde_csr(sde, SD(DESC_CNT), 0);
2027	write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
2028	write_sde_csr(sde, SD(MEMORY),
2029		      ((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
2030		      ((u64)(credits * sde->this_idx) <<
2031		       SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT)));
2032	write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
2033	set_sdma_integrity(sde);
2034	opmask = OPCODE_CHECK_MASK_DISABLED;
2035	opval = OPCODE_CHECK_VAL_DISABLED;
2036	write_sde_csr(sde, SD(CHECK_OPCODE),
2037		      (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
2038		      (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
2039}
2040
2041#ifdef CONFIG_SDMA_VERBOSITY
2042
2043#define sdma_dumpstate_helper0(reg) do { \
2044		csr = read_csr(sde->dd, reg); \
2045		dd_dev_err(sde->dd, "%36s     0x%016llx\n", #reg, csr); \
2046	} while (0)
2047
2048#define sdma_dumpstate_helper(reg) do { \
2049		csr = read_sde_csr(sde, reg); \
2050		dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
2051			#reg, sde->this_idx, csr); \
2052	} while (0)
2053
2054#define sdma_dumpstate_helper2(reg) do { \
2055		csr = read_csr(sde->dd, reg + (8 * i)); \
2056		dd_dev_err(sde->dd, "%33s_%02u     0x%016llx\n", \
2057				#reg, i, csr); \
2058	} while (0)
2059
2060void sdma_dumpstate(struct sdma_engine *sde)
2061{
2062	u64 csr;
2063	unsigned i;
2064
2065	sdma_dumpstate_helper(SD(CTRL));
2066	sdma_dumpstate_helper(SD(STATUS));
2067	sdma_dumpstate_helper0(SD(ERR_STATUS));
2068	sdma_dumpstate_helper0(SD(ERR_MASK));
2069	sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
2070	sdma_dumpstate_helper(SD(ENG_ERR_MASK));
2071
2072	for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
2073		sdma_dumpstate_helper2(CCE_INT_STATUS);
2074		sdma_dumpstate_helper2(CCE_INT_MASK);
2075		sdma_dumpstate_helper2(CCE_INT_BLOCKED);
2076	}
2077
2078	sdma_dumpstate_helper(SD(TAIL));
2079	sdma_dumpstate_helper(SD(HEAD));
2080	sdma_dumpstate_helper(SD(PRIORITY_THLD));
2081	sdma_dumpstate_helper(SD(IDLE_CNT));
2082	sdma_dumpstate_helper(SD(RELOAD_CNT));
2083	sdma_dumpstate_helper(SD(DESC_CNT));
2084	sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
2085	sdma_dumpstate_helper(SD(MEMORY));
2086	sdma_dumpstate_helper0(SD(ENGINES));
2087	sdma_dumpstate_helper0(SD(MEM_SIZE));
2088	/* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS);  */
2089	sdma_dumpstate_helper(SD(BASE_ADDR));
2090	sdma_dumpstate_helper(SD(LEN_GEN));
2091	sdma_dumpstate_helper(SD(HEAD_ADDR));
2092	sdma_dumpstate_helper(SD(CHECK_ENABLE));
2093	sdma_dumpstate_helper(SD(CHECK_VL));
2094	sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
2095	sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
2096	sdma_dumpstate_helper(SD(CHECK_SLID));
2097	sdma_dumpstate_helper(SD(CHECK_OPCODE));
2098}
2099#endif
2100
2101static void dump_sdma_state(struct sdma_engine *sde)
2102{
2103	struct hw_sdma_desc *descqp;
2104	u64 desc[2];
2105	u64 addr;
2106	u8 gen;
2107	u16 len;
2108	u16 head, tail, cnt;
2109
2110	head = sde->descq_head & sde->sdma_mask;
2111	tail = sde->descq_tail & sde->sdma_mask;
2112	cnt = sdma_descq_freecnt(sde);
2113
2114	dd_dev_err(sde->dd,
2115		   "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
2116		   sde->this_idx, head, tail, cnt,
2117		   !list_empty(&sde->flushlist));
2118
2119	/* print info for each entry in the descriptor queue */
2120	while (head != tail) {
2121		char flags[6] = { 'x', 'x', 'x', 'x', 0 };
2122
2123		descqp = &sde->descq[head];
2124		desc[0] = le64_to_cpu(descqp->qw[0]);
2125		desc[1] = le64_to_cpu(descqp->qw[1]);
2126		flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
2127		flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
2128				'H' : '-';
2129		flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
2130		flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
2131		addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
2132			& SDMA_DESC0_PHY_ADDR_MASK;
2133		gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
2134			& SDMA_DESC1_GENERATION_MASK;
2135		len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
2136			& SDMA_DESC0_BYTE_COUNT_MASK;
2137		dd_dev_err(sde->dd,
2138			   "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
2139			   head, flags, addr, gen, len);
2140		dd_dev_err(sde->dd,
2141			   "\tdesc0:0x%016llx desc1 0x%016llx\n",
2142			   desc[0], desc[1]);
2143		if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
2144			dd_dev_err(sde->dd,
2145				   "\taidx: %u amode: %u alen: %u\n",
2146				   (u8)((desc[1] &
2147					 SDMA_DESC1_HEADER_INDEX_SMASK) >>
2148					SDMA_DESC1_HEADER_INDEX_SHIFT),
2149				   (u8)((desc[1] &
2150					 SDMA_DESC1_HEADER_MODE_SMASK) >>
2151					SDMA_DESC1_HEADER_MODE_SHIFT),
2152				   (u8)((desc[1] &
2153					 SDMA_DESC1_HEADER_DWS_SMASK) >>
2154					SDMA_DESC1_HEADER_DWS_SHIFT));
2155		head++;
2156		head &= sde->sdma_mask;
2157	}
2158}
2159
2160#define SDE_FMT \
2161	"SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
2162/**
2163 * sdma_seqfile_dump_sde() - debugfs dump of sde
2164 * @s: seq file
2165 * @sde: send dma engine to dump
2166 *
2167 * This routine dumps the sde to the indicated seq file.
2168 */
2169void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
2170{
2171	u16 head, tail;
2172	struct hw_sdma_desc *descqp;
2173	u64 desc[2];
2174	u64 addr;
2175	u8 gen;
2176	u16 len;
2177
2178	head = sde->descq_head & sde->sdma_mask;
2179	tail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
2180	seq_printf(s, SDE_FMT, sde->this_idx,
2181		   sde->cpu,
2182		   sdma_state_name(sde->state.current_state),
2183		   (unsigned long long)read_sde_csr(sde, SD(CTRL)),
2184		   (unsigned long long)read_sde_csr(sde, SD(STATUS)),
2185		   (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)),
2186		   (unsigned long long)read_sde_csr(sde, SD(TAIL)), tail,
2187		   (unsigned long long)read_sde_csr(sde, SD(HEAD)), head,
2188		   (unsigned long long)le64_to_cpu(*sde->head_dma),
2189		   (unsigned long long)read_sde_csr(sde, SD(MEMORY)),
2190		   (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
2191		   (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
2192		   (unsigned long long)sde->last_status,
2193		   (unsigned long long)sde->ahg_bits,
2194		   sde->tx_tail,
2195		   sde->tx_head,
2196		   sde->descq_tail,
2197		   sde->descq_head,
2198		   !list_empty(&sde->flushlist),
2199		   sde->descq_full_count,
2200		   (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
2201
2202	/* print info for each entry in the descriptor queue */
2203	while (head != tail) {
2204		char flags[6] = { 'x', 'x', 'x', 'x', 0 };
2205
2206		descqp = &sde->descq[head];
2207		desc[0] = le64_to_cpu(descqp->qw[0]);
2208		desc[1] = le64_to_cpu(descqp->qw[1]);
2209		flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
2210		flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
2211				'H' : '-';
2212		flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
2213		flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
2214		addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
2215			& SDMA_DESC0_PHY_ADDR_MASK;
2216		gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
2217			& SDMA_DESC1_GENERATION_MASK;
2218		len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
2219			& SDMA_DESC0_BYTE_COUNT_MASK;
2220		seq_printf(s,
2221			   "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
2222			   head, flags, addr, gen, len);
2223		if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
2224			seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
2225				   (u8)((desc[1] &
2226					 SDMA_DESC1_HEADER_INDEX_SMASK) >>
2227					SDMA_DESC1_HEADER_INDEX_SHIFT),
2228				   (u8)((desc[1] &
2229					 SDMA_DESC1_HEADER_MODE_SMASK) >>
2230					SDMA_DESC1_HEADER_MODE_SHIFT));
2231		head = (head + 1) & sde->sdma_mask;
2232	}
2233}
2234
2235/*
2236 * add the generation number into
2237 * the qw1 and return
2238 */
2239static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
2240{
2241	u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3;
2242
2243	qw1 &= ~SDMA_DESC1_GENERATION_SMASK;
2244	qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK)
2245			<< SDMA_DESC1_GENERATION_SHIFT;
2246	return qw1;
2247}
2248
2249/*
 * This routine submits the indicated tx.
2251 *
2252 * Space has already been guaranteed and
2253 * tail side of ring is locked.
2254 *
2255 * The hardware tail update is done
2256 * in the caller and that is facilitated
2257 * by returning the new tail.
2258 *
2259 * There is special case logic for ahg
2260 * to not add the generation number for
2261 * up to 2 descriptors that follow the
2262 * first descriptor.
2263 *
2264 */
2265static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
2266{
2267	int i;
2268	u16 tail;
2269	struct sdma_desc *descp = tx->descp;
2270	u8 skip = 0, mode = ahg_mode(tx);
2271
2272	tail = sde->descq_tail & sde->sdma_mask;
2273	sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
2274	sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
2275	trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
2276				   tail, &sde->descq[tail]);
2277	tail = ++sde->descq_tail & sde->sdma_mask;
2278	descp++;
2279	if (mode > SDMA_AHG_APPLY_UPDATE1)
2280		skip = mode >> 1;
2281	for (i = 1; i < tx->num_desc; i++, descp++) {
2282		u64 qw1;
2283
2284		sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
2285		if (skip) {
2286			/* edits don't have generation */
2287			qw1 = descp->qw[1];
2288			skip--;
2289		} else {
2290			/* replace generation with real one for non-edits */
2291			qw1 = add_gen(sde, descp->qw[1]);
2292		}
2293		sde->descq[tail].qw[1] = cpu_to_le64(qw1);
2294		trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
2295					   tail, &sde->descq[tail]);
2296		tail = ++sde->descq_tail & sde->sdma_mask;
2297	}
2298	tx->next_descq_idx = tail;
2299#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
2300	tx->sn = sde->tail_sn++;
2301	trace_hfi1_sdma_in_sn(sde, tx->sn);
2302	WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
2303#endif
2304	sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
2305	sde->desc_avail -= tx->num_desc;
2306	return tail;
2307}
2308
2309/*
2310 * Check for progress
2311 */
2312static int sdma_check_progress(
2313	struct sdma_engine *sde,
2314	struct iowait_work *wait,
2315	struct sdma_txreq *tx,
2316	bool pkts_sent)
2317{
2318	int ret;
2319
2320	sde->desc_avail = sdma_descq_freecnt(sde);
2321	if (tx->num_desc <= sde->desc_avail)
2322		return -EAGAIN;
2323	/* pulse the head_lock */
2324	if (wait && iowait_ioww_to_iow(wait)->sleep) {
2325		unsigned seq;
2326
2327		seq = raw_seqcount_begin(
2328			(const seqcount_t *)&sde->head_lock.seqcount);
2329		ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent);
2330		if (ret == -EAGAIN)
2331			sde->desc_avail = sdma_descq_freecnt(sde);
2332	} else {
2333		ret = -EBUSY;
2334	}
2335	return ret;
2336}
2337
2338/**
2339 * sdma_send_txreq() - submit a tx req to ring
2340 * @sde: sdma engine to use
2341 * @wait: SE wait structure to use when full (may be NULL)
2342 * @tx: sdma_txreq to submit
2343 * @pkts_sent: has any packet been sent yet?
2344 *
 * The call submits the tx into the ring.  If an iowait structure is
 * non-NULL, the packet will be queued to the list in wait.
2347 *
2348 * Return:
 * 0 - Success,
 * -EINVAL - sdma_txreq incomplete,
 * -EBUSY - no space in ring (wait == NULL),
 * -EIOCBQUEUED - tx queued to iowait,
 * -ECOMM - bad sdma state
2352 */
2353int sdma_send_txreq(struct sdma_engine *sde,
2354		    struct iowait_work *wait,
2355		    struct sdma_txreq *tx,
2356		    bool pkts_sent)
2357{
2358	int ret = 0;
2359	u16 tail;
2360	unsigned long flags;
2361
2362	/* user should have supplied entire packet */
2363	if (unlikely(tx->tlen))
2364		return -EINVAL;
2365	tx->wait = iowait_ioww_to_iow(wait);
2366	spin_lock_irqsave(&sde->tail_lock, flags);
2367retry:
2368	if (unlikely(!__sdma_running(sde)))
2369		goto unlock_noconn;
2370	if (unlikely(tx->num_desc > sde->desc_avail))
2371		goto nodesc;
2372	tail = submit_tx(sde, tx);
2373	if (wait)
2374		iowait_sdma_inc(iowait_ioww_to_iow(wait));
2375	sdma_update_tail(sde, tail);
2376unlock:
2377	spin_unlock_irqrestore(&sde->tail_lock, flags);
2378	return ret;
2379unlock_noconn:
2380	if (wait)
2381		iowait_sdma_inc(iowait_ioww_to_iow(wait));
2382	tx->next_descq_idx = 0;
2383#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
2384	tx->sn = sde->tail_sn++;
2385	trace_hfi1_sdma_in_sn(sde, tx->sn);
2386#endif
2387	spin_lock(&sde->flushlist_lock);
2388	list_add_tail(&tx->list, &sde->flushlist);
2389	spin_unlock(&sde->flushlist_lock);
2390	iowait_inc_wait_count(wait, tx->num_desc);
2391	queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
2392	ret = -ECOMM;
2393	goto unlock;
2394nodesc:
2395	ret = sdma_check_progress(sde, wait, tx, pkts_sent);
2396	if (ret == -EAGAIN) {
2397		ret = 0;
2398		goto retry;
2399	}
2400	sde->descq_full_count++;
2401	goto unlock;
2402}
2403
2404/**
2405 * sdma_send_txlist() - submit a list of tx req to ring
2406 * @sde: sdma engine to use
2407 * @wait: SE wait structure to use when full (may be NULL)
2408 * @tx_list: list of sdma_txreqs to submit
 * @count_out: pointer to a u16 which, after return, will contain the total
 *             number of sdma_txreqs removed from the tx_list. This includes
 *             the sdma_txreqs whose SDMA descriptors were submitted to the
 *             ring and the sdma_txreqs that were added to the SDMA engine
 *             flush list because the SDMA engine state is not running.
2414 *
2415 * The call submits the list into the ring.
2416 *
 * If the iowait structure is non-NULL and not equal to the iowait list,
 * the unprocessed part of the list will be appended to the list in wait.
2419 *
2420 * In all cases, the tx_list will be updated so the head of the tx_list is
2421 * the list of descriptors that have yet to be transmitted.
2422 *
 * The intent of this call is to provide a more efficient
 * way of submitting multiple packets to SDMA while holding the tail-side
 * lock.
2426 *
2427 * Return:
2428 * 0 - Success,
 * -EINVAL - sdma_txreq incomplete,
 * -EBUSY - no space in ring (wait == NULL),
 * -EIOCBQUEUED - tx queued to iowait,
 * -ECOMM - bad sdma state
2431 */
2432int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait,
2433		     struct list_head *tx_list, u16 *count_out)
2434{
2435	struct sdma_txreq *tx, *tx_next;
2436	int ret = 0;
2437	unsigned long flags;
2438	u16 tail = INVALID_TAIL;
2439	u32 submit_count = 0, flush_count = 0, total_count;
2440
2441	spin_lock_irqsave(&sde->tail_lock, flags);
2442retry:
2443	list_for_each_entry_safe(tx, tx_next, tx_list, list) {
2444		tx->wait = iowait_ioww_to_iow(wait);
2445		if (unlikely(!__sdma_running(sde)))
2446			goto unlock_noconn;
2447		if (unlikely(tx->num_desc > sde->desc_avail))
2448			goto nodesc;
2449		if (unlikely(tx->tlen)) {
2450			ret = -EINVAL;
2451			goto update_tail;
2452		}
2453		list_del_init(&tx->list);
2454		tail = submit_tx(sde, tx);
2455		submit_count++;
2456		if (tail != INVALID_TAIL &&
2457		    (submit_count & SDMA_TAIL_UPDATE_THRESH) == 0) {
2458			sdma_update_tail(sde, tail);
2459			tail = INVALID_TAIL;
2460		}
2461	}
2462update_tail:
2463	total_count = submit_count + flush_count;
2464	if (wait) {
2465		iowait_sdma_add(iowait_ioww_to_iow(wait), total_count);
2466		iowait_starve_clear(submit_count > 0,
2467				    iowait_ioww_to_iow(wait));
2468	}
2469	if (tail != INVALID_TAIL)
2470		sdma_update_tail(sde, tail);
2471	spin_unlock_irqrestore(&sde->tail_lock, flags);
2472	*count_out = total_count;
2473	return ret;
2474unlock_noconn:
2475	spin_lock(&sde->flushlist_lock);
2476	list_for_each_entry_safe(tx, tx_next, tx_list, list) {
2477		tx->wait = iowait_ioww_to_iow(wait);
2478		list_del_init(&tx->list);
2479		tx->next_descq_idx = 0;
2480#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
2481		tx->sn = sde->tail_sn++;
2482		trace_hfi1_sdma_in_sn(sde, tx->sn);
2483#endif
2484		list_add_tail(&tx->list, &sde->flushlist);
2485		flush_count++;
2486		iowait_inc_wait_count(wait, tx->num_desc);
2487	}
2488	spin_unlock(&sde->flushlist_lock);
2489	queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
2490	ret = -ECOMM;
2491	goto update_tail;
2492nodesc:
2493	ret = sdma_check_progress(sde, wait, tx, submit_count > 0);
2494	if (ret == -EAGAIN) {
2495		ret = 0;
2496		goto retry;
2497	}
2498	sde->descq_full_count++;
2499	goto update_tail;
2500}
2501
2502static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event)
2503{
2504	unsigned long flags;
2505
2506	spin_lock_irqsave(&sde->tail_lock, flags);
2507	write_seqlock(&sde->head_lock);
2508
2509	__sdma_process_event(sde, event);
2510
2511	if (sde->state.current_state == sdma_state_s99_running)
2512		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
2513
2514	write_sequnlock(&sde->head_lock);
2515	spin_unlock_irqrestore(&sde->tail_lock, flags);
2516}
2517
2518static void __sdma_process_event(struct sdma_engine *sde,
2519				 enum sdma_events event)
2520{
2521	struct sdma_state *ss = &sde->state;
2522	int need_progress = 0;
2523
2524	/* CONFIG SDMA temporary */
2525#ifdef CONFIG_SDMA_VERBOSITY
2526	dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
2527		   sdma_state_names[ss->current_state],
2528		   sdma_event_names[event]);
2529#endif
2530
2531	switch (ss->current_state) {
2532	case sdma_state_s00_hw_down:
2533		switch (event) {
2534		case sdma_event_e00_go_hw_down:
2535			break;
2536		case sdma_event_e30_go_running:
2537			/*
			 * If down, but running requested (usually the result
			 * of a link up), then we need to start up.
2540			 * This can happen when hw down is requested while
2541			 * bringing the link up with traffic active on
2542			 * 7220, e.g.
2543			 */
2544			ss->go_s99_running = 1;
2545			fallthrough;	/* and start dma engine */
2546		case sdma_event_e10_go_hw_start:
2547			/* This reference means the state machine is started */
2548			sdma_get(&sde->state);
2549			sdma_set_state(sde,
2550				       sdma_state_s10_hw_start_up_halt_wait);
2551			break;
2552		case sdma_event_e15_hw_halt_done:
2553			break;
2554		case sdma_event_e25_hw_clean_up_done:
2555			break;
2556		case sdma_event_e40_sw_cleaned:
2557			sdma_sw_tear_down(sde);
2558			break;
2559		case sdma_event_e50_hw_cleaned:
2560			break;
2561		case sdma_event_e60_hw_halted:
2562			break;
2563		case sdma_event_e70_go_idle:
2564			break;
2565		case sdma_event_e80_hw_freeze:
2566			break;
2567		case sdma_event_e81_hw_frozen:
2568			break;
2569		case sdma_event_e82_hw_unfreeze:
2570			break;
2571		case sdma_event_e85_link_down:
2572			break;
2573		case sdma_event_e90_sw_halted:
2574			break;
2575		}
2576		break;
2577
2578	case sdma_state_s10_hw_start_up_halt_wait:
2579		switch (event) {
2580		case sdma_event_e00_go_hw_down:
2581			sdma_set_state(sde, sdma_state_s00_hw_down);
2582			sdma_sw_tear_down(sde);
2583			break;
2584		case sdma_event_e10_go_hw_start:
2585			break;
2586		case sdma_event_e15_hw_halt_done:
2587			sdma_set_state(sde,
2588				       sdma_state_s15_hw_start_up_clean_wait);
2589			sdma_start_hw_clean_up(sde);
2590			break;
2591		case sdma_event_e25_hw_clean_up_done:
2592			break;
2593		case sdma_event_e30_go_running:
2594			ss->go_s99_running = 1;
2595			break;
2596		case sdma_event_e40_sw_cleaned:
2597			break;
2598		case sdma_event_e50_hw_cleaned:
2599			break;
2600		case sdma_event_e60_hw_halted:
2601			schedule_work(&sde->err_halt_worker);
2602			break;
2603		case sdma_event_e70_go_idle:
2604			ss->go_s99_running = 0;
2605			break;
2606		case sdma_event_e80_hw_freeze:
2607			break;
2608		case sdma_event_e81_hw_frozen:
2609			break;
2610		case sdma_event_e82_hw_unfreeze:
2611			break;
2612		case sdma_event_e85_link_down:
2613			break;
2614		case sdma_event_e90_sw_halted:
2615			break;
2616		}
2617		break;
2618
2619	case sdma_state_s15_hw_start_up_clean_wait:
2620		switch (event) {
2621		case sdma_event_e00_go_hw_down:
2622			sdma_set_state(sde, sdma_state_s00_hw_down);
2623			sdma_sw_tear_down(sde);
2624			break;
2625		case sdma_event_e10_go_hw_start:
2626			break;
2627		case sdma_event_e15_hw_halt_done:
2628			break;
2629		case sdma_event_e25_hw_clean_up_done:
2630			sdma_hw_start_up(sde);
2631			sdma_set_state(sde, ss->go_s99_running ?
2632				       sdma_state_s99_running :
2633				       sdma_state_s20_idle);
2634			break;
2635		case sdma_event_e30_go_running:
2636			ss->go_s99_running = 1;
2637			break;
2638		case sdma_event_e40_sw_cleaned:
2639			break;
2640		case sdma_event_e50_hw_cleaned:
2641			break;
2642		case sdma_event_e60_hw_halted:
2643			break;
2644		case sdma_event_e70_go_idle:
2645			ss->go_s99_running = 0;
2646			break;
2647		case sdma_event_e80_hw_freeze:
2648			break;
2649		case sdma_event_e81_hw_frozen:
2650			break;
2651		case sdma_event_e82_hw_unfreeze:
2652			break;
2653		case sdma_event_e85_link_down:
2654			break;
2655		case sdma_event_e90_sw_halted:
2656			break;
2657		}
2658		break;
2659
2660	case sdma_state_s20_idle:
2661		switch (event) {
2662		case sdma_event_e00_go_hw_down:
2663			sdma_set_state(sde, sdma_state_s00_hw_down);
2664			sdma_sw_tear_down(sde);
2665			break;
2666		case sdma_event_e10_go_hw_start:
2667			break;
2668		case sdma_event_e15_hw_halt_done:
2669			break;
2670		case sdma_event_e25_hw_clean_up_done:
2671			break;
2672		case sdma_event_e30_go_running:
2673			sdma_set_state(sde, sdma_state_s99_running);
2674			ss->go_s99_running = 1;
2675			break;
2676		case sdma_event_e40_sw_cleaned:
2677			break;
2678		case sdma_event_e50_hw_cleaned:
2679			break;
2680		case sdma_event_e60_hw_halted:
2681			sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
2682			schedule_work(&sde->err_halt_worker);
2683			break;
2684		case sdma_event_e70_go_idle:
2685			break;
2686		case sdma_event_e85_link_down:
2687		case sdma_event_e80_hw_freeze:
2688			sdma_set_state(sde, sdma_state_s80_hw_freeze);
2689			atomic_dec(&sde->dd->sdma_unfreeze_count);
2690			wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
2691			break;
2692		case sdma_event_e81_hw_frozen:
2693			break;
2694		case sdma_event_e82_hw_unfreeze:
2695			break;
2696		case sdma_event_e90_sw_halted:
2697			break;
2698		}
2699		break;
2700
2701	case sdma_state_s30_sw_clean_up_wait:
2702		switch (event) {
2703		case sdma_event_e00_go_hw_down:
2704			sdma_set_state(sde, sdma_state_s00_hw_down);
2705			break;
2706		case sdma_event_e10_go_hw_start:
2707			break;
2708		case sdma_event_e15_hw_halt_done:
2709			break;
2710		case sdma_event_e25_hw_clean_up_done:
2711			break;
2712		case sdma_event_e30_go_running:
2713			ss->go_s99_running = 1;
2714			break;
2715		case sdma_event_e40_sw_cleaned:
2716			sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
2717			sdma_start_hw_clean_up(sde);
2718			break;
2719		case sdma_event_e50_hw_cleaned:
2720			break;
2721		case sdma_event_e60_hw_halted:
2722			break;
2723		case sdma_event_e70_go_idle:
2724			ss->go_s99_running = 0;
2725			break;
2726		case sdma_event_e80_hw_freeze:
2727			break;
2728		case sdma_event_e81_hw_frozen:
2729			break;
2730		case sdma_event_e82_hw_unfreeze:
2731			break;
2732		case sdma_event_e85_link_down:
2733			ss->go_s99_running = 0;
2734			break;
2735		case sdma_event_e90_sw_halted:
2736			break;
2737		}
2738		break;
2739
2740	case sdma_state_s40_hw_clean_up_wait:
2741		switch (event) {
2742		case sdma_event_e00_go_hw_down:
2743			sdma_set_state(sde, sdma_state_s00_hw_down);
2744			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
2745			break;
2746		case sdma_event_e10_go_hw_start:
2747			break;
2748		case sdma_event_e15_hw_halt_done:
2749			break;
2750		case sdma_event_e25_hw_clean_up_done:
2751			sdma_hw_start_up(sde);
2752			sdma_set_state(sde, ss->go_s99_running ?
2753				       sdma_state_s99_running :
2754				       sdma_state_s20_idle);
2755			break;
2756		case sdma_event_e30_go_running:
2757			ss->go_s99_running = 1;
2758			break;
2759		case sdma_event_e40_sw_cleaned:
2760			break;
2761		case sdma_event_e50_hw_cleaned:
2762			break;
2763		case sdma_event_e60_hw_halted:
2764			break;
2765		case sdma_event_e70_go_idle:
2766			ss->go_s99_running = 0;
2767			break;
2768		case sdma_event_e80_hw_freeze:
2769			break;
2770		case sdma_event_e81_hw_frozen:
2771			break;
2772		case sdma_event_e82_hw_unfreeze:
2773			break;
2774		case sdma_event_e85_link_down:
2775			ss->go_s99_running = 0;
2776			break;
2777		case sdma_event_e90_sw_halted:
2778			break;
2779		}
2780		break;
2781
2782	case sdma_state_s50_hw_halt_wait:
2783		switch (event) {
2784		case sdma_event_e00_go_hw_down:
2785			sdma_set_state(sde, sdma_state_s00_hw_down);
2786			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
2787			break;
2788		case sdma_event_e10_go_hw_start:
2789			break;
2790		case sdma_event_e15_hw_halt_done:
2791			sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
2792			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
2793			break;
2794		case sdma_event_e25_hw_clean_up_done:
2795			break;
2796		case sdma_event_e30_go_running:
2797			ss->go_s99_running = 1;
2798			break;
2799		case sdma_event_e40_sw_cleaned:
2800			break;
2801		case sdma_event_e50_hw_cleaned:
2802			break;
2803		case sdma_event_e60_hw_halted:
2804			schedule_work(&sde->err_halt_worker);
2805			break;
2806		case sdma_event_e70_go_idle:
2807			ss->go_s99_running = 0;
2808			break;
2809		case sdma_event_e80_hw_freeze:
2810			break;
2811		case sdma_event_e81_hw_frozen:
2812			break;
2813		case sdma_event_e82_hw_unfreeze:
2814			break;
2815		case sdma_event_e85_link_down:
2816			ss->go_s99_running = 0;
2817			break;
2818		case sdma_event_e90_sw_halted:
2819			break;
2820		}
2821		break;
2822
2823	case sdma_state_s60_idle_halt_wait:
2824		switch (event) {
2825		case sdma_event_e00_go_hw_down:
2826			sdma_set_state(sde, sdma_state_s00_hw_down);
2827			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
2828			break;
2829		case sdma_event_e10_go_hw_start:
2830			break;
2831		case sdma_event_e15_hw_halt_done:
2832			sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
2833			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
2834			break;
2835		case sdma_event_e25_hw_clean_up_done:
2836			break;
2837		case sdma_event_e30_go_running:
2838			ss->go_s99_running = 1;
2839			break;
2840		case sdma_event_e40_sw_cleaned:
2841			break;
2842		case sdma_event_e50_hw_cleaned:
2843			break;
2844		case sdma_event_e60_hw_halted:
2845			schedule_work(&sde->err_halt_worker);
2846			break;
2847		case sdma_event_e70_go_idle:
2848			ss->go_s99_running = 0;
2849			break;
2850		case sdma_event_e80_hw_freeze:
2851			break;
2852		case sdma_event_e81_hw_frozen:
2853			break;
2854		case sdma_event_e82_hw_unfreeze:
2855			break;
2856		case sdma_event_e85_link_down:
2857			break;
2858		case sdma_event_e90_sw_halted:
2859			break;
2860		}
2861		break;
2862
2863	case sdma_state_s80_hw_freeze:
2864		switch (event) {
2865		case sdma_event_e00_go_hw_down:
2866			sdma_set_state(sde, sdma_state_s00_hw_down);
2867			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
2868			break;
2869		case sdma_event_e10_go_hw_start:
2870			break;
2871		case sdma_event_e15_hw_halt_done:
2872			break;
2873		case sdma_event_e25_hw_clean_up_done:
2874			break;
2875		case sdma_event_e30_go_running:
2876			ss->go_s99_running = 1;
2877			break;
2878		case sdma_event_e40_sw_cleaned:
2879			break;
2880		case sdma_event_e50_hw_cleaned:
2881			break;
2882		case sdma_event_e60_hw_halted:
2883			break;
2884		case sdma_event_e70_go_idle:
2885			ss->go_s99_running = 0;
2886			break;
2887		case sdma_event_e80_hw_freeze:
2888			break;
2889		case sdma_event_e81_hw_frozen:
2890			sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
2891			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
2892			break;
2893		case sdma_event_e82_hw_unfreeze:
2894			break;
2895		case sdma_event_e85_link_down:
2896			break;
2897		case sdma_event_e90_sw_halted:
2898			break;
2899		}
2900		break;
2901
2902	case sdma_state_s82_freeze_sw_clean:
2903		switch (event) {
2904		case sdma_event_e00_go_hw_down:
2905			sdma_set_state(sde, sdma_state_s00_hw_down);
2906			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
2907			break;
2908		case sdma_event_e10_go_hw_start:
2909			break;
2910		case sdma_event_e15_hw_halt_done:
2911			break;
2912		case sdma_event_e25_hw_clean_up_done:
2913			break;
2914		case sdma_event_e30_go_running:
2915			ss->go_s99_running = 1;
2916			break;
2917		case sdma_event_e40_sw_cleaned:
2918			/* notify caller this engine is done cleaning */
2919			atomic_dec(&sde->dd->sdma_unfreeze_count);
2920			wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
2921			break;
2922		case sdma_event_e50_hw_cleaned:
2923			break;
2924		case sdma_event_e60_hw_halted:
2925			break;
2926		case sdma_event_e70_go_idle:
2927			ss->go_s99_running = 0;
2928			break;
2929		case sdma_event_e80_hw_freeze:
2930			break;
2931		case sdma_event_e81_hw_frozen:
2932			break;
2933		case sdma_event_e82_hw_unfreeze:
2934			sdma_hw_start_up(sde);
2935			sdma_set_state(sde, ss->go_s99_running ?
2936				       sdma_state_s99_running :
2937				       sdma_state_s20_idle);
2938			break;
2939		case sdma_event_e85_link_down:
2940			break;
2941		case sdma_event_e90_sw_halted:
2942			break;
2943		}
2944		break;
2945
2946	case sdma_state_s99_running:
2947		switch (event) {
2948		case sdma_event_e00_go_hw_down:
2949			sdma_set_state(sde, sdma_state_s00_hw_down);
2950			tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
2951			break;
2952		case sdma_event_e10_go_hw_start:
2953			break;
2954		case sdma_event_e15_hw_halt_done:
2955			break;
2956		case sdma_event_e25_hw_clean_up_done:
2957			break;
2958		case sdma_event_e30_go_running:
2959			break;
2960		case sdma_event_e40_sw_cleaned:
2961			break;
2962		case sdma_event_e50_hw_cleaned:
2963			break;
2964		case sdma_event_e60_hw_halted:
2965			need_progress = 1;
2966			sdma_err_progress_check_schedule(sde);
2967			fallthrough;
2968		case sdma_event_e90_sw_halted:
2969			/*
2970			* SW initiated halt does not perform engines
2971			* progress check
2972			*/
2973			sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
2974			schedule_work(&sde->err_halt_worker);
2975			break;
2976		case sdma_event_e70_go_idle:
2977			sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
2978			break;
2979		case sdma_event_e85_link_down:
2980			ss->go_s99_running = 0;
2981			fallthrough;
2982		case sdma_event_e80_hw_freeze:
2983			sdma_set_state(sde, sdma_state_s80_hw_freeze);
2984			atomic_dec(&sde->dd->sdma_unfreeze_count);
2985			wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
2986			break;
2987		case sdma_event_e81_hw_frozen:
2988			break;
2989		case sdma_event_e82_hw_unfreeze:
2990			break;
2991		}
2992		break;
2993	}
2994
2995	ss->last_event = event;
2996	if (need_progress)
2997		sdma_make_progress(sde, 0);
2998}
2999
3000/*
3001 * _extend_sdma_tx_descs() - helper to extend txreq
3002 *
3003 * This is called once the initial nominal allocation
3004 * of descriptors in the sdma_txreq is exhausted.
3005 *
 * The code will bump the allocation up to the max
 * of MAX_DESC (64) descriptors. There doesn't seem to be
 * much point in an interim step. The last descriptor
 * is reserved for the coalesce buffer in order to support
 * cases where the input packet has >MAX_DESC iovecs.
3011 *
3012 */
3013static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
3014{
3015	int i;
3016	struct sdma_desc *descp;
3017
3018	/* Handle last descriptor */
	if (unlikely(tx->num_desc == (MAX_DESC - 1))) {
		/* if tlen is 0, it is for padding; release the last descriptor */
3021		if (!tx->tlen) {
3022			tx->desc_limit = MAX_DESC;
3023		} else if (!tx->coalesce_buf) {
3024			/* allocate coalesce buffer with space for padding */
3025			tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32),
3026						   GFP_ATOMIC);
3027			if (!tx->coalesce_buf)
3028				goto enomem;
3029			tx->coalesce_idx = 0;
3030		}
3031		return 0;
3032	}
3033
3034	if (unlikely(tx->num_desc == MAX_DESC))
3035		goto enomem;
3036
3037	descp = kmalloc_array(MAX_DESC, sizeof(struct sdma_desc), GFP_ATOMIC);
3038	if (!descp)
3039		goto enomem;
3040	tx->descp = descp;
3041
3042	/* reserve last descriptor for coalescing */
3043	tx->desc_limit = MAX_DESC - 1;
3044	/* copy ones already built */
3045	for (i = 0; i < tx->num_desc; i++)
3046		tx->descp[i] = tx->descs[i];
3047	return 0;
3048enomem:
3049	__sdma_txclean(dd, tx);
3050	return -ENOMEM;
3051}
3052
3053/*
3054 * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors
3055 *
3056 * This is called once the initial nominal allocation of descriptors
3057 * in the sdma_txreq is exhausted.
3058 *
 * This function calls _extend_sdma_tx_descs to extend or allocate the
 * coalesce buffer. If a coalesce buffer is allocated, it will copy the
 * input packet data into the coalesce buffer. It also adds the coalesce
 * buffer descriptor once the whole packet has been received.
3063 *
3064 * Return:
3065 * <0 - error
3066 * 0 - coalescing, don't populate descriptor
3067 * 1 - continue with populating descriptor
3068 */
3069int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
3070			   int type, void *kvaddr, struct page *page,
3071			   unsigned long offset, u16 len)
3072{
3073	int pad_len, rval;
3074	dma_addr_t addr;
3075
3076	rval = _extend_sdma_tx_descs(dd, tx);
3077	if (rval) {
3078		__sdma_txclean(dd, tx);
3079		return rval;
3080	}
3081
3082	/* If coalesce buffer is allocated, copy data into it */
3083	if (tx->coalesce_buf) {
3084		if (type == SDMA_MAP_NONE) {
3085			__sdma_txclean(dd, tx);
3086			return -EINVAL;
3087		}
3088
3089		if (type == SDMA_MAP_PAGE) {
3090			kvaddr = kmap_local_page(page);
3091			kvaddr += offset;
3092		} else if (WARN_ON(!kvaddr)) {
3093			__sdma_txclean(dd, tx);
3094			return -EINVAL;
3095		}
3096
3097		memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
3098		tx->coalesce_idx += len;
3099		if (type == SDMA_MAP_PAGE)
3100			kunmap_local(kvaddr);
3101
3102		/* If there is more data, return */
3103		if (tx->tlen - tx->coalesce_idx)
3104			return 0;
3105
3106		/* Whole packet is received; add any padding */
3107		pad_len = tx->packet_len & (sizeof(u32) - 1);
3108		if (pad_len) {
3109			pad_len = sizeof(u32) - pad_len;
3110			memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len);
3111			/* padding is taken care of for coalescing case */
3112			tx->packet_len += pad_len;
3113			tx->tlen += pad_len;
3114		}
3115
3116		/* dma map the coalesce buffer */
3117		addr = dma_map_single(&dd->pcidev->dev,
3118				      tx->coalesce_buf,
3119				      tx->tlen,
3120				      DMA_TO_DEVICE);
3121
3122		if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
3123			__sdma_txclean(dd, tx);
3124			return -ENOSPC;
3125		}
3126
3127		/* Add descriptor for coalesce buffer */
3128		tx->desc_limit = MAX_DESC;
3129		return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx,
3130					 addr, tx->tlen, NULL, NULL, NULL);
3131	}
3132
3133	return 1;
3134}
3135
3136/* Update sdes when the lmc changes */
3137void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
3138{
3139	struct sdma_engine *sde;
3140	int i;
3141	u64 sreg;
3142
3143	sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) <<
3144		SD(CHECK_SLID_MASK_SHIFT)) |
3145		(((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
3146		SD(CHECK_SLID_VALUE_SHIFT));
3147
3148	for (i = 0; i < dd->num_sdma; i++) {
3149		hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
3150			  i, (u32)sreg);
3151		sde = &dd->per_sdma[i];
3152		write_sde_csr(sde, SD(CHECK_SLID), sreg);
3153	}
3154}
3155
3156/* tx not dword sized - pad */
3157int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
3158{
3159	int rval = 0;
3160
	if (unlikely(tx->num_desc == tx->desc_limit)) {
3162		rval = _extend_sdma_tx_descs(dd, tx);
3163		if (rval) {
3164			__sdma_txclean(dd, tx);
3165			return rval;
3166		}
3167	}
3168
3169	/* finish the one just added */
3170	make_tx_sdma_desc(
3171		tx,
3172		SDMA_MAP_NONE,
3173		dd->sdma_pad_phys,
3174		sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)),
3175		NULL, NULL, NULL);
3176	tx->num_desc++;
3177	_sdma_close_tx(dd, tx);
3178	return rval;
3179}
3180
3181/*
3182 * Add ahg to the sdma_txreq
3183 *
3184 * The logic will consume up to 3
3185 * descriptors at the beginning of
3186 * sdma_txreq.
3187 */
3188void _sdma_txreq_ahgadd(
3189	struct sdma_txreq *tx,
3190	u8 num_ahg,
3191	u8 ahg_entry,
3192	u32 *ahg,
3193	u8 ahg_hlen)
3194{
3195	u32 i, shift = 0, desc = 0;
3196	u8 mode;
3197
3198	WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);
3199	/* compute mode */
3200	if (num_ahg == 1)
3201		mode = SDMA_AHG_APPLY_UPDATE1;
3202	else if (num_ahg <= 5)
3203		mode = SDMA_AHG_APPLY_UPDATE2;
3204	else
3205		mode = SDMA_AHG_APPLY_UPDATE3;
3206	tx->num_desc++;
	/* initialize the consumed descriptors to zero */
3208	switch (mode) {
3209	case SDMA_AHG_APPLY_UPDATE3:
3210		tx->num_desc++;
3211		tx->descs[2].qw[0] = 0;
3212		tx->descs[2].qw[1] = 0;
3213		fallthrough;
3214	case SDMA_AHG_APPLY_UPDATE2:
3215		tx->num_desc++;
3216		tx->descs[1].qw[0] = 0;
3217		tx->descs[1].qw[1] = 0;
3218		break;
3219	}
3220	ahg_hlen >>= 2;
3221	tx->descs[0].qw[1] |=
3222		(((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
3223			<< SDMA_DESC1_HEADER_INDEX_SHIFT) |
3224		(((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
3225			<< SDMA_DESC1_HEADER_DWS_SHIFT) |
3226		(((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
3227			<< SDMA_DESC1_HEADER_MODE_SHIFT) |
3228		(((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
3229			<< SDMA_DESC1_HEADER_UPDATE1_SHIFT);
3230	for (i = 0; i < (num_ahg - 1); i++) {
3231		if (!shift && !(i & 2))
3232			desc++;
3233		tx->descs[desc].qw[!!(i & 2)] |=
3234			(((u64)ahg[i + 1])
3235				<< shift);
3236		shift = (shift + 32) & 63;
3237	}
3238}
3239
3240/**
3241 * sdma_ahg_alloc - allocate an AHG entry
3242 * @sde: engine to allocate from
3243 *
3244 * Return:
3245 * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
3246 * -ENOSPC if an entry is not available
3247 */
3248int sdma_ahg_alloc(struct sdma_engine *sde)
3249{
3250	int nr;
3251	int oldbit;
3252
3253	if (!sde) {
3254		trace_hfi1_ahg_allocate(sde, -EINVAL);
3255		return -EINVAL;
3256	}
3257	while (1) {
3258		nr = ffz(READ_ONCE(sde->ahg_bits));
3259		if (nr > 31) {
3260			trace_hfi1_ahg_allocate(sde, -ENOSPC);
3261			return -ENOSPC;
3262		}
3263		oldbit = test_and_set_bit(nr, &sde->ahg_bits);
3264		if (!oldbit)
3265			break;
3266		cpu_relax();
3267	}
3268	trace_hfi1_ahg_allocate(sde, nr);
3269	return nr;
3270}
3271
3272/**
3273 * sdma_ahg_free - free an AHG entry
3274 * @sde: engine to return AHG entry
3275 * @ahg_index: index to free
3276 *
 * This routine frees the indicated AHG entry.
3278 */
3279void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
3280{
3281	if (!sde)
3282		return;
3283	trace_hfi1_ahg_deallocate(sde, ahg_index);
3284	if (ahg_index < 0 || ahg_index > 31)
3285		return;
3286	clear_bit(ahg_index, &sde->ahg_bits);
3287}
3288
3289/*
3290 * SPC freeze handling for SDMA engines.  Called when the driver knows
3291 * the SPC is going into a freeze but before the freeze is fully
 * settled.  This is generally triggered by an error interrupt.
3293 *
3294 * This event will pull the engine out of running so no more entries can be
3295 * added to the engine's queue.
3296 */
3297void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
3298{
3299	int i;
3300	enum sdma_events event = link_down ? sdma_event_e85_link_down :
3301					     sdma_event_e80_hw_freeze;
3302
3303	/* set up the wait but do not wait here */
3304	atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
3305
3306	/* tell all engines to stop running and wait */
3307	for (i = 0; i < dd->num_sdma; i++)
3308		sdma_process_event(&dd->per_sdma[i], event);
3309
3310	/* sdma_freeze() will wait for all engines to have stopped */
3311}
3312
3313/*
3314 * SPC freeze handling for SDMA engines.  Called when the driver knows
3315 * the SPC is fully frozen.
3316 */
3317void sdma_freeze(struct hfi1_devdata *dd)
3318{
3319	int i;
3320	int ret;
3321
3322	/*
3323	 * Make sure all engines have moved out of the running state before
3324	 * continuing.
3325	 */
3326	ret = wait_event_interruptible(dd->sdma_unfreeze_wq,
3327				       atomic_read(&dd->sdma_unfreeze_count) <=
3328				       0);
	/* if interrupted or the count went negative (unloading), just exit */
3330	if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0)
3331		return;
3332
3333	/* set up the count for the next wait */
3334	atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);
3335
3336	/* tell all engines that the SPC is frozen, they can start cleaning */
3337	for (i = 0; i < dd->num_sdma; i++)
3338		sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);
3339
3340	/*
3341	 * Wait for everyone to finish software clean before exiting.  The
3342	 * software clean will read engine CSRs, so must be completed before
3343	 * the next step, which will clear the engine CSRs.
3344	 */
3345	(void)wait_event_interruptible(dd->sdma_unfreeze_wq,
3346				atomic_read(&dd->sdma_unfreeze_count) <= 0);
3347	/* no need to check results - done no matter what */
3348}
3349
3350/*
3351 * SPC freeze handling for the SDMA engines.  Called after the SPC is unfrozen.
3352 *
3353 * The SPC freeze acts like a SDMA halt and a hardware clean combined.  All
3354 * that is left is a software clean.  We could do it after the SPC is fully
3355 * frozen, but then we'd have to add another state to wait for the unfreeze.
3356 * Instead, just defer the software clean until the unfreeze step.
3357 */
3358void sdma_unfreeze(struct hfi1_devdata *dd)
3359{
3360	int i;
3361
	/* tell all engines to start freeze clean up */
3363	for (i = 0; i < dd->num_sdma; i++)
3364		sdma_process_event(&dd->per_sdma[i],
3365				   sdma_event_e82_hw_unfreeze);
3366}
3367
3368/**
3369 * _sdma_engine_progress_schedule() - schedule progress on engine
3370 * @sde: sdma_engine to schedule progress
3371 *
3372 */
3373void _sdma_engine_progress_schedule(
3374	struct sdma_engine *sde)
3375{
3376	trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
3377	/* assume we have selected a good cpu */
3378	write_csr(sde->dd,
3379		  CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)),
3380		  sde->progress_mask);
3381}
3382