// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2021 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include <uapi/drm/habanalabs_accel.h>
#include "habanalabs.h"

#include <linux/uaccess.h>
#include <linux/slab.h>

#define HL_CS_FLAGS_TYPE_MASK	(HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
			HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \
			HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND | \
			HL_CS_FLAGS_ENGINES_COMMAND | HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES)


#define MAX_TS_ITER_NUM 100

/**
 * enum hl_cs_wait_status - cs wait status
 * @CS_WAIT_STATUS_BUSY: cs was not completed yet
 * @CS_WAIT_STATUS_COMPLETED: cs completed
 * @CS_WAIT_STATUS_GONE: cs completed but fence is already gone
 */
enum hl_cs_wait_status {
	CS_WAIT_STATUS_BUSY,
	CS_WAIT_STATUS_COMPLETED,
	CS_WAIT_STATUS_GONE
};

/*
 * Data used while handling wait/timestamp nodes.
 * The purpose of this struct is to store the needed data for both operations
 * in one variable instead of passing a large number of arguments to functions.
 */
struct wait_interrupt_data {
	struct hl_user_interrupt *interrupt;
	struct hl_mmap_mem_buf *buf;
	struct hl_mem_mgr *mmg;
	struct hl_cb *cq_cb;
	u64 ts_handle;
	u64 ts_offset;
	u64 cq_handle;
	u64 cq_offset;
	u64 target_value;
	u64 intr_timeout_us;
};

static void job_wq_completion(struct work_struct *work);
static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq,
				enum hl_cs_wait_status *status, s64 *timestamp);
static void cs_do_release(struct kref *ref);

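/*
 * hl_push_cs_outcome - store the outcome (error code and timestamp) of a
 * completed CS in the context's outcome store, keyed by its sequence number.
 * If no free node is available, the oldest stored outcome is recycled and its
 * information is lost.
 */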
static void hl_push_cs_outcome(struct hl_device *hdev,
			       struct hl_cs_outcome_store *outcome_store,
			       u64 seq, ktime_t ts, int error)
{
	struct hl_cs_outcome *node;
	unsigned long flags;

	/*
	 * CS outcome store supports the following operations:
	 * push outcome - store a recent CS outcome in the store
	 * pop outcome - retrieve a SPECIFIC (by seq) CS outcome from the store
	 * It uses 2 lists: used list and free list.
	 * It has a pre-allocated amount of nodes, each node stores
	 * a single CS outcome.
	 * Initially, all the nodes are in the free list.
	 * On push outcome, a node (any) is taken from the free list, its
	 * information is filled in, and the node is moved to the used list.
	 * It is possible that there are no nodes left in the free list.
	 * In this case, we will lose some information about old outcomes. We
	 * will pop the OLDEST node from the used list and make it free.
	 * On pop, the node is searched for in the used list (using a search
	 * index).
	 * If found, the node is then removed from the used list, and moved
	 * back to the free list. The outcome data that the node contained is
	 * returned back to the user.
	 */

	spin_lock_irqsave(&outcome_store->db_lock, flags);

	if (list_empty(&outcome_store->free_list)) {
		node = list_last_entry(&outcome_store->used_list,
				       struct hl_cs_outcome, list_link);
		hash_del(&node->map_link);
		dev_dbg(hdev->dev, "CS %llu outcome was lost\n", node->seq);
	} else {
		node = list_last_entry(&outcome_store->free_list,
				       struct hl_cs_outcome, list_link);
	}

	list_del_init(&node->list_link);

	node->seq = seq;
	node->ts = ts;
	node->error = error;

	list_add(&node->list_link, &outcome_store->used_list);
	hash_add(outcome_store->outcome_map, &node->map_link, node->seq);

	spin_unlock_irqrestore(&outcome_store->db_lock, flags);
}

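/*
 * hl_pop_cs_outcome - look up the stored outcome of a specific CS sequence,
 * copy its timestamp and error to the caller and return the node to the free
 * list. Returns true if the sequence was found in the store, false otherwise.
 */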
static bool hl_pop_cs_outcome(struct hl_cs_outcome_store *outcome_store,
			       u64 seq, ktime_t *ts, int *error)
{
	struct hl_cs_outcome *node;
	unsigned long flags;

	spin_lock_irqsave(&outcome_store->db_lock, flags);

	hash_for_each_possible(outcome_store->outcome_map, node, map_link, seq)
		if (node->seq == seq) {
			*ts = node->ts;
			*error = node->error;

			hash_del(&node->map_link);
			list_del_init(&node->list_link);
			list_add(&node->list_link, &outcome_store->free_list);

			spin_unlock_irqrestore(&outcome_store->db_lock, flags);

			return true;
		}

	spin_unlock_irqrestore(&outcome_store->db_lock, flags);

	return false;
}

static void hl_sob_reset(struct kref *ref)
{
	struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
							kref);
	struct hl_device *hdev = hw_sob->hdev;

	dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id);

	hdev->asic_funcs->reset_sob(hdev, hw_sob);

	hw_sob->need_reset = false;
}

void hl_sob_reset_error(struct kref *ref)
{
	struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
							kref);
	struct hl_device *hdev = hw_sob->hdev;

	dev_crit(hdev->dev,
		"SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n",
		hw_sob->q_idx, hw_sob->sob_id);
}

void hw_sob_put(struct hl_hw_sob *hw_sob)
{
	if (hw_sob)
		kref_put(&hw_sob->kref, hl_sob_reset);
}

static void hw_sob_put_err(struct hl_hw_sob *hw_sob)
{
	if (hw_sob)
		kref_put(&hw_sob->kref, hl_sob_reset_error);
}

void hw_sob_get(struct hl_hw_sob *hw_sob)
{
	if (hw_sob)
		kref_get(&hw_sob->kref);
}

/**
 * hl_gen_sob_mask() - Generates a sob mask to be used in a monitor arm packet
 * @sob_base: sob base id
 * @sob_mask: sob user mask, each bit represents a sob offset from sob base
 * @mask: generated mask
 *
 * Return: 0 if given parameters are valid
 */
int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask)
{
	int i;

	if (sob_mask == 0)
		return -EINVAL;

	if (sob_mask == 0x1) {
		*mask = ~(1 << (sob_base & 0x7));
	} else {
		/* find msb in order to verify sob range is valid */
		for (i = BITS_PER_BYTE - 1 ; i >= 0 ; i--)
			if (BIT(i) & sob_mask)
				break;

		if (i > (HL_MAX_SOBS_PER_MONITOR - (sob_base & 0x7) - 1))
			return -EINVAL;

		*mask = ~sob_mask;
	}

	return 0;
}

static void hl_fence_release(struct kref *kref)
{
	struct hl_fence *fence =
		container_of(kref, struct hl_fence, refcount);
	struct hl_cs_compl *hl_cs_cmpl =
		container_of(fence, struct hl_cs_compl, base_fence);

	kfree(hl_cs_cmpl);
}

void hl_fence_put(struct hl_fence *fence)
{
	if (IS_ERR_OR_NULL(fence))
		return;
	kref_put(&fence->refcount, hl_fence_release);
}

void hl_fences_put(struct hl_fence **fence, int len)
{
	int i;

	for (i = 0; i < len; i++, fence++)
		hl_fence_put(*fence);
}

void hl_fence_get(struct hl_fence *fence)
{
	if (fence)
		kref_get(&fence->refcount);
}

static void hl_fence_init(struct hl_fence *fence, u64 sequence)
{
	kref_init(&fence->refcount);
	fence->cs_sequence = sequence;
	fence->error = 0;
	fence->timestamp = ktime_set(0, 0);
	fence->mcs_handling_done = false;
	init_completion(&fence->completion);
}

void cs_get(struct hl_cs *cs)
{
	kref_get(&cs->refcount);
}

static int cs_get_unless_zero(struct hl_cs *cs)
{
	return kref_get_unless_zero(&cs->refcount);
}

static void cs_put(struct hl_cs *cs)
{
	kref_put(&cs->refcount, cs_do_release);
}

static void cs_job_do_release(struct kref *ref)
{
	struct hl_cs_job *job = container_of(ref, struct hl_cs_job, refcount);

	kfree(job);
}

static void hl_cs_job_put(struct hl_cs_job *job)
{
	kref_put(&job->refcount, cs_job_do_release);
}

bool cs_needs_completion(struct hl_cs *cs)
{
	/* In case this is a staged CS, only the last CS in sequence should
	 * get a completion, any non staged CS will always get a completion
	 */
	if (cs->staged_cs && !cs->staged_last)
		return false;

	return true;
}

bool cs_needs_timeout(struct hl_cs *cs)
{
	/* In case this is a staged CS, only the first CS in sequence should
	 * get a timeout, any non staged CS will always get a timeout
	 */
	if (cs->staged_cs && !cs->staged_first)
		return false;

	return true;
}

static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
{
	/* Patched CB is created for external queues jobs */
	return (job->queue_type == QUEUE_TYPE_EXT);
}

/*
 * cs_parser - parse the user command submission
 *
 * @hpriv: pointer to the private data of the fd
 * @job: pointer to the job that holds the command submission info
 *
 * The function parses the command submission of the user. It calls the
 * ASIC specific parser, which returns a list of memory blocks to send
 * to the device as different command buffers
 *
 */
static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
{
	struct hl_device *hdev = hpriv->hdev;
	struct hl_cs_parser parser;
	int rc;

	parser.ctx_id = job->cs->ctx->asid;
	parser.cs_sequence = job->cs->sequence;
	parser.job_id = job->id;

	parser.hw_queue_id = job->hw_queue_id;
	parser.job_userptr_list = &job->userptr_list;
	parser.patched_cb = NULL;
	parser.user_cb = job->user_cb;
	parser.user_cb_size = job->user_cb_size;
	parser.queue_type = job->queue_type;
	parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
	job->patched_cb = NULL;
	parser.completion = cs_needs_completion(job->cs);

	rc = hdev->asic_funcs->cs_parser(hdev, &parser);

	if (is_cb_patched(hdev, job)) {
		if (!rc) {
			job->patched_cb = parser.patched_cb;
			job->job_cb_size = parser.patched_cb_size;
			job->contains_dma_pkt = parser.contains_dma_pkt;
			atomic_inc(&job->patched_cb->cs_cnt);
		}

		/*
		 * Whether the parsing worked or not, we don't need the
		 * original CB anymore because it was already parsed and
		 * won't be accessed again for this CS
		 */
		atomic_dec(&job->user_cb->cs_cnt);
		hl_cb_put(job->user_cb);
		job->user_cb = NULL;
	} else if (!rc) {
		job->job_cb_size = job->user_cb_size;
	}

	return rc;
}

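/*
 * hl_complete_job - release all resources held by a finished job: the patched
 * and/or user CBs, the job's node in the CS job list, and, for jobs that get a
 * completion, the CS reference that was taken when the job was queued.
 */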
static void hl_complete_job(struct hl_device *hdev, struct hl_cs_job *job)
{
	struct hl_cs *cs = job->cs;

	if (is_cb_patched(hdev, job)) {
		hl_userptr_delete_list(hdev, &job->userptr_list);

		/*
		 * We might arrive here from rollback and patched CB wasn't
		 * created, so we need to check it's not NULL
		 */
		if (job->patched_cb) {
			atomic_dec(&job->patched_cb->cs_cnt);
			hl_cb_put(job->patched_cb);
		}
	}

	/* For H/W queue jobs, if a user CB was allocated by driver,
	 * the user CB isn't released in cs_parser() and thus should be
	 * released here. This is also true for INT queues jobs which were
	 * allocated by driver.
	 */
	if (job->is_kernel_allocated_cb &&
			(job->queue_type == QUEUE_TYPE_HW || job->queue_type == QUEUE_TYPE_INT)) {
		atomic_dec(&job->user_cb->cs_cnt);
		hl_cb_put(job->user_cb);
	}

	/*
	 * This is the only place where there can be multiple threads
	 * modifying the list at the same time
	 */
	spin_lock(&cs->job_lock);
	list_del(&job->cs_node);
	spin_unlock(&cs->job_lock);

	hl_debugfs_remove_job(hdev, job);

	/* We decrement reference only for a CS that gets completion
	 * because the reference was incremented only for this kind of CS
	 * right before it was scheduled.
	 *
	 * In staged submission, only the last CS marked as 'staged_last'
	 * gets completion, hence its release function will be called from here.
	 * As for all the other CSs in the staged submission, which do not get
	 * a completion, their CS reference will be decremented by the
	 * 'staged_last' CS during the CS release flow.
	 * All relevant PQ CI counters will be incremented during the CS release
	 * flow by calling 'hl_hw_queue_update_ci'.
	 */
	if (cs_needs_completion(cs) &&
			(job->queue_type == QUEUE_TYPE_EXT || job->queue_type == QUEUE_TYPE_HW)) {

		/* In CS based completions, the timestamp is already available,
		 * so no need to extract it from job
		 */
		if (hdev->asic_prop.completion_mode == HL_COMPLETION_MODE_JOB)
			cs->completion_timestamp = job->timestamp;

		cs_put(cs);
	}

	hl_cs_job_put(job);
}

/*
 * hl_staged_cs_find_first - locate the first CS in this staged submission
 *
 * @hdev: pointer to device structure
 * @cs_seq: staged submission sequence number
 *
 * @note: This function must be called under 'hdev->cs_mirror_lock'
 *
 * Find and return a CS pointer with the given sequence
 */
struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq)
{
	struct hl_cs *cs;

	list_for_each_entry_reverse(cs, &hdev->cs_mirror_list, mirror_node)
		if (cs->staged_cs && cs->staged_first &&
				cs->sequence == cs_seq)
			return cs;

	return NULL;
}

/*
 * is_staged_cs_last_exists - returns true if the last CS in sequence exists
 *
 * @hdev: pointer to device structure
 * @cs: staged submission member
 *
 */
bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs)
{
	struct hl_cs *last_entry;

	last_entry = list_last_entry(&cs->staged_cs_node, struct hl_cs,
								staged_cs_node);

	if (last_entry->staged_last)
		return true;

	return false;
}

/*
 * staged_cs_get - get CS reference if this CS is a part of a staged CS
 *
 * @hdev: pointer to device structure
 * @cs: current CS
 *
 * Increment CS reference for every CS in this staged submission except for
 * the CS which gets a completion.
 */
static void staged_cs_get(struct hl_device *hdev, struct hl_cs *cs)
{
	/* Only the last CS in this staged submission will get a completion.
	 * We must increment the reference for all other CS's in this
	 * staged submission.
	 * Once we get a completion we will release the whole staged submission.
	 */
	if (!cs->staged_last)
		cs_get(cs);
}

/*
 * staged_cs_put - put a CS in case it is part of staged submission
 *
 * @hdev: pointer to device structure
 * @cs: CS to put
 *
 * This function decrements a CS reference (for a non completion CS)
 */
static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs)
{
	/* We release all CSs in a staged submission except the last
	 * CS, whose reference we never incremented.
	 */
	if (!cs_needs_completion(cs))
		cs_put(cs);
}

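/*
 * cs_handle_tdr - cancel the TDR work of a completed CS (or of the first CS in
 * its staged submission) and arm the TDR of the next CS in the mirror list
 * that needs a timeout.
 */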
static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
{
	struct hl_cs *next = NULL, *iter, *first_cs;

	if (!cs_needs_timeout(cs))
		return;

	spin_lock(&hdev->cs_mirror_lock);

	/* We need to handle TDR only once for the complete staged submission.
	 * Hence, we choose the CS that reaches this function first, which is
	 * the CS marked as 'staged_last'.
	 * In case a single staged CS was submitted which has both first and
	 * last indications, then "cs_find_first" below will return NULL, since
	 * we removed the CS node from the list before getting here.
	 * In such a case, just continue with the CS to cancel its TDR work.
	 */
	if (cs->staged_cs && cs->staged_last) {
		first_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence);
		if (first_cs)
			cs = first_cs;
	}

	spin_unlock(&hdev->cs_mirror_lock);

	/* Don't cancel TDR in case this CS was timedout because we might be
	 * running from the TDR context
	 */
	if (cs->timedout || hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT)
		return;

	if (cs->tdr_active)
		cancel_delayed_work_sync(&cs->work_tdr);

	spin_lock(&hdev->cs_mirror_lock);

	/* queue TDR for next CS */
	list_for_each_entry(iter, &hdev->cs_mirror_list, mirror_node)
		if (cs_needs_timeout(iter)) {
			next = iter;
			break;
		}

	if (next && !next->tdr_active) {
		next->tdr_active = true;
		schedule_delayed_work(&next->work_tdr, next->timeout_jiffies);
	}

	spin_unlock(&hdev->cs_mirror_lock);
}

/*
 * force_complete_multi_cs - complete all contexts that wait on multi-CS
 *
 * @hdev: pointer to habanalabs device structure
 */
static void force_complete_multi_cs(struct hl_device *hdev)
{
	int i;

	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
		struct multi_cs_completion *mcs_compl;

		mcs_compl = &hdev->multi_cs_completion[i];

		spin_lock(&mcs_compl->lock);

		if (!mcs_compl->used) {
			spin_unlock(&mcs_compl->lock);
			continue;
		}

		/* When calling force complete, no context should be waiting on
		 * multi-CS.
		 * We are calling the function as a protection for such a case,
		 * to free any pending context and print an error message.
		 */
		dev_err(hdev->dev,
				"multi-CS completion context %d still waiting when calling force completion\n",
				i);
		complete_all(&mcs_compl->completion);
		spin_unlock(&mcs_compl->lock);
	}
}

/*
 * complete_multi_cs - complete all waiting entities on multi-CS
 *
 * @hdev: pointer to habanalabs device structure
 * @cs: CS structure
 * The function signals a waiting entity that has overlapping stream masters
 * with the completed CS.
 * For example:
 * - a completed CS worked on stream master QID 4, multi CS completion
 *   is actively waiting on stream master QIDs 3, 5. don't send signal as no
 *   common stream master QID
 * - a completed CS worked on stream master QID 4, multi CS completion
 *   is actively waiting on stream master QIDs 3, 4. send signal as stream
 *   master QID 4 is common
 */
static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
{
	struct hl_fence *fence = cs->fence;
	int i;

	/* in case of multi CS check for completion only for the first CS */
	if (cs->staged_cs && !cs->staged_first)
		return;

	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
		struct multi_cs_completion *mcs_compl;

		mcs_compl = &hdev->multi_cs_completion[i];
		if (!mcs_compl->used)
			continue;

		spin_lock(&mcs_compl->lock);

		/*
		 * complete if:
		 * 1. still waiting for completion
		 * 2. the completed CS has at least one overlapping stream
		 *    master with the stream masters in the completion
		 */
		if (mcs_compl->used &&
				(fence->stream_master_qid_map &
					mcs_compl->stream_master_qid_map)) {
			/* extract the timestamp only of first completed CS */
			if (!mcs_compl->timestamp)
				mcs_compl->timestamp = ktime_to_ns(fence->timestamp);

			complete_all(&mcs_compl->completion);

			/*
			 * Setting mcs_handling_done inside the lock ensures
			 * that at least one fence has mcs_handling_done set to
			 * true before the wait for mcs finishes. This ensures
			 * at least one CS will be set as completed when polling
			 * mcs fences.
			 */
			fence->mcs_handling_done = true;
		}

		spin_unlock(&mcs_compl->lock);
	}
	/* In case CS completed without mcs completion initialized */
	fence->mcs_handling_done = true;
}

static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
					struct hl_cs *cs,
					struct hl_cs_compl *hl_cs_cmpl)
{
	/* Skip this handler if the CS wasn't submitted, to avoid putting
	 * the hw_sob twice, since that case was already handled at this point.
	 * Also skip if the hw_sob pointer wasn't set.
	 */
	if (!hl_cs_cmpl->hw_sob || !cs->submitted)
		return;

	spin_lock(&hl_cs_cmpl->lock);

	/*
	 * we get refcount upon reservation of signals or signal/wait cs for the
	 * hw_sob object, and need to put it when the first staged cs
	 * (which contains the encaps signals) or cs signal/wait is completed.
	 */
	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
			(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
			(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) ||
			(!!hl_cs_cmpl->encaps_signals)) {
		dev_dbg(hdev->dev,
				"CS 0x%llx type %d finished, sob_id: %d, sob_val: %u\n",
				hl_cs_cmpl->cs_seq,
				hl_cs_cmpl->type,
				hl_cs_cmpl->hw_sob->sob_id,
				hl_cs_cmpl->sob_val);

		hw_sob_put(hl_cs_cmpl->hw_sob);

		if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
			hdev->asic_funcs->reset_sob_group(hdev,
					hl_cs_cmpl->sob_group);
	}

	spin_unlock(&hl_cs_cmpl->lock);
}

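/*
 * cs_do_release - final release of a CS, called when its refcount drops to
 * zero. Completes any remaining jobs, updates the queue CI counters, removes
 * the CS from the mirror list, handles TDR and staged-submission bookkeeping,
 * signals the fence and frees the CS object.
 */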
static void cs_do_release(struct kref *ref)
{
	struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
	struct hl_device *hdev = cs->ctx->hdev;
	struct hl_cs_job *job, *tmp;
	struct hl_cs_compl *hl_cs_cmpl =
			container_of(cs->fence, struct hl_cs_compl, base_fence);

	cs->completed = true;

	/*
	 * Although if we reached here it means that all external jobs have
	 * finished, because each one of them took refcnt to CS, we still
	 * need to go over the internal jobs and complete them. Otherwise, we
	 * will have leaked memory and what's worse, the CS object (and
	 * potentially the CTX object) could be released, while the JOB
	 * still holds a pointer to them (but no reference).
	 */
	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
		hl_complete_job(hdev, job);

	if (!cs->submitted) {
		/*
		 * In case the wait for signal CS was submitted, the fence put
		 * occurs in init_signal_wait_cs() or collective_wait_init_cs()
		 * right before hanging on the PQ.
		 */
		if (cs->type == CS_TYPE_WAIT ||
				cs->type == CS_TYPE_COLLECTIVE_WAIT)
			hl_fence_put(cs->signal_fence);

		goto out;
	}

	/* Need to update CI for all queue jobs that do not get a completion */
	hl_hw_queue_update_ci(cs);

	/* remove CS from CS mirror list */
	spin_lock(&hdev->cs_mirror_lock);
	list_del_init(&cs->mirror_node);
	spin_unlock(&hdev->cs_mirror_lock);

	cs_handle_tdr(hdev, cs);

	if (cs->staged_cs) {
		/* the completion CS decrements reference for the entire
		 * staged submission
		 */
		if (cs->staged_last) {
			struct hl_cs *staged_cs, *tmp_cs;

			list_for_each_entry_safe(staged_cs, tmp_cs,
					&cs->staged_cs_node, staged_cs_node)
				staged_cs_put(hdev, staged_cs);
		}

		/* A staged CS will be a member in the list only after it
		 * was submitted. We used 'cs_mirror_lock' when inserting
		 * it to the list, so we will use it again when removing it.
		 */
		if (cs->submitted) {
			spin_lock(&hdev->cs_mirror_lock);
			list_del(&cs->staged_cs_node);
			spin_unlock(&hdev->cs_mirror_lock);
		}

		/* decrement refcount to handle when first staged cs
		 * with encaps signals is completed.
		 */
		if (hl_cs_cmpl->encaps_signals)
			kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
					hl_encaps_release_handle_and_put_ctx);
	}

	if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
		kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);

out:
	/* Must be called before hl_ctx_put because inside we use ctx to get
	 * the device
	 */
	hl_debugfs_remove_cs(cs);

	hdev->shadow_cs_queue[cs->sequence & (hdev->asic_prop.max_pending_cs - 1)] = NULL;

	/* We need to mark an error for a CS that was not submitted, because in
	 * that case the hl fence release flow is different. Mainly, we don't
	 * need to handle hw_sob for signal/wait.
	 */
	if (cs->timedout)
		cs->fence->error = -ETIMEDOUT;
	else if (cs->aborted)
		cs->fence->error = -EIO;
	else if (!cs->submitted)
		cs->fence->error = -EBUSY;

	if (unlikely(cs->skip_reset_on_timeout)) {
		dev_err(hdev->dev,
			"Command submission %llu completed after %llu (s)\n",
			cs->sequence,
			div_u64(jiffies - cs->submission_time_jiffies, HZ));
	}

	if (cs->timestamp) {
		cs->fence->timestamp = cs->completion_timestamp;
		hl_push_cs_outcome(hdev, &cs->ctx->outcome_store, cs->sequence,
				   cs->fence->timestamp, cs->fence->error);
	}

	hl_ctx_put(cs->ctx);

	complete_all(&cs->fence->completion);
	complete_multi_cs(hdev, cs);

	cs_release_sob_reset_handler(hdev, cs, hl_cs_cmpl);

	hl_fence_put(cs->fence);

	kfree(cs->jobs_in_queue_cnt);
	kfree(cs);
}

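/*
 * cs_timedout - TDR work handler, invoked when a CS did not complete within
 * its timeout. Records the first timeout parameters, prints a per-type error
 * message and triggers a device reset if the driver is configured to do so.
 */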
static void cs_timedout(struct work_struct *work)
{
	struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
	bool skip_reset_on_timeout, device_reset = false;
	struct hl_device *hdev;
	u64 event_mask = 0x0;
	uint timeout_sec;
	int rc;

	skip_reset_on_timeout = cs->skip_reset_on_timeout;

	rc = cs_get_unless_zero(cs);
	if (!rc)
		return;

	if ((!cs->submitted) || (cs->completed)) {
		cs_put(cs);
		return;
	}

	hdev = cs->ctx->hdev;

	if (likely(!skip_reset_on_timeout)) {
		if (hdev->reset_on_lockup)
			device_reset = true;
		else
			hdev->reset_info.needs_reset = true;

		/* Mark the CS as timed out so we won't try to cancel its TDR */
		cs->timedout = true;
	}

	/* Save only the first CS timeout parameters */
	rc = atomic_cmpxchg(&hdev->captured_err_info.cs_timeout.write_enable, 1, 0);
	if (rc) {
		hdev->captured_err_info.cs_timeout.timestamp = ktime_get();
		hdev->captured_err_info.cs_timeout.seq = cs->sequence;
		event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
	}

	timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;

	switch (cs->type) {
	case CS_TYPE_SIGNAL:
		dev_err(hdev->dev,
			"Signal command submission %llu has not finished in %u seconds!\n",
			cs->sequence, timeout_sec);
		break;

	case CS_TYPE_WAIT:
		dev_err(hdev->dev,
			"Wait command submission %llu has not finished in %u seconds!\n",
			cs->sequence, timeout_sec);
		break;

	case CS_TYPE_COLLECTIVE_WAIT:
		dev_err(hdev->dev,
			"Collective Wait command submission %llu has not finished in %u seconds!\n",
			cs->sequence, timeout_sec);
		break;

	default:
		dev_err(hdev->dev,
			"Command submission %llu has not finished in %u seconds!\n",
			cs->sequence, timeout_sec);
		break;
	}

	rc = hl_state_dump(hdev);
	if (rc)
		dev_err(hdev->dev, "Error during system state dump %d\n", rc);

	cs_put(cs);

	if (device_reset) {
		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
		hl_device_cond_reset(hdev, HL_DRV_RESET_TDR, event_mask);
	} else if (event_mask) {
		hl_notifier_event_send_all(hdev, event_mask);
	}
}

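/*
 * allocate_cs - allocate and initialize a new CS object together with its
 * completion fence. Takes a context reference, allocates the per-queue job
 * counters and registers the fence in the context's cs_pending array.
 * Returns -EAGAIN when there are too many CSs in flight.
 */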
static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
			enum hl_cs_type cs_type, u64 user_sequence,
			struct hl_cs **cs_new, u32 flags, u32 timeout)
{
	struct hl_cs_counters_atomic *cntr;
	struct hl_fence *other = NULL;
	struct hl_cs_compl *cs_cmpl;
	struct hl_cs *cs;
	int rc;

	cntr = &hdev->aggregated_cs_counters;

	cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
	if (!cs)
		cs = kzalloc(sizeof(*cs), GFP_KERNEL);

	if (!cs) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&cntr->out_of_mem_drop_cnt);
		return -ENOMEM;
	}

	/* increment refcnt for context */
	hl_ctx_get(ctx);

	cs->ctx = ctx;
	cs->submitted = false;
	cs->completed = false;
	cs->type = cs_type;
	cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
	cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
	cs->timeout_jiffies = timeout;
	cs->skip_reset_on_timeout =
		hdev->reset_info.skip_reset_on_timeout ||
		!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
	cs->submission_time_jiffies = jiffies;
	INIT_LIST_HEAD(&cs->job_list);
	INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
	kref_init(&cs->refcount);
	spin_lock_init(&cs->job_lock);

	cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
	if (!cs_cmpl)
		cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_KERNEL);

	if (!cs_cmpl) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&cntr->out_of_mem_drop_cnt);
		rc = -ENOMEM;
		goto free_cs;
	}

	cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
			sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
	if (!cs->jobs_in_queue_cnt)
		cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
				sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL);

	if (!cs->jobs_in_queue_cnt) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&cntr->out_of_mem_drop_cnt);
		rc = -ENOMEM;
		goto free_cs_cmpl;
	}

	cs_cmpl->hdev = hdev;
	cs_cmpl->type = cs->type;
	spin_lock_init(&cs_cmpl->lock);
	cs->fence = &cs_cmpl->base_fence;

	spin_lock(&ctx->cs_lock);

	cs_cmpl->cs_seq = ctx->cs_sequence;
	other = ctx->cs_pending[cs_cmpl->cs_seq &
				(hdev->asic_prop.max_pending_cs - 1)];

	if (other && !completion_done(&other->completion)) {
		/* If the following statement is true, it means we have reached
		 * a point in which only part of the staged submission was
		 * submitted and we don't have enough room in the 'cs_pending'
		 * array for the rest of the submission.
		 * This causes a deadlock because this CS will never be
		 * completed as it depends on future CS's for completion.
		 */
		if (other->cs_sequence == user_sequence)
			dev_crit_ratelimited(hdev->dev,
				"Staged CS %llu deadlock due to lack of resources",
				user_sequence);

		dev_dbg_ratelimited(hdev->dev,
			"Rejecting CS because of too many in-flight CSs\n");
		atomic64_inc(&ctx->cs_counters.max_cs_in_flight_drop_cnt);
		atomic64_inc(&cntr->max_cs_in_flight_drop_cnt);
		rc = -EAGAIN;
		goto free_fence;
	}

	/* init hl_fence */
	hl_fence_init(&cs_cmpl->base_fence, cs_cmpl->cs_seq);

	cs->sequence = cs_cmpl->cs_seq;

	ctx->cs_pending[cs_cmpl->cs_seq &
			(hdev->asic_prop.max_pending_cs - 1)] =
							&cs_cmpl->base_fence;
	ctx->cs_sequence++;

	hl_fence_get(&cs_cmpl->base_fence);

	hl_fence_put(other);

	spin_unlock(&ctx->cs_lock);

	*cs_new = cs;

	return 0;

free_fence:
	spin_unlock(&ctx->cs_lock);
	kfree(cs->jobs_in_queue_cnt);
free_cs_cmpl:
	kfree(cs_cmpl);
free_cs:
	kfree(cs);
	hl_ctx_put(ctx);
	return rc;
}

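/*
 * cs_rollback - undo a CS that was not (fully) submitted: drop its staged
 * submission reference, if any, and complete all of its jobs.
 */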
static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
{
	struct hl_cs_job *job, *tmp;

	staged_cs_put(hdev, cs);

	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
		hl_complete_job(hdev, job);
}

/*
 * release_reserved_encaps_signals() - release reserved encapsulated signals.
 * @hdev: pointer to habanalabs device structure
 *
 * Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
 * encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
 * For these signals we also need to put the refcount of the H/W SOB which was taken at the
 * reservation.
 */
static void release_reserved_encaps_signals(struct hl_device *hdev)
{
	struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
	struct hl_cs_encaps_sig_handle *handle;
	struct hl_encaps_signals_mgr *mgr;
	u32 id;

	if (!ctx)
		return;

	mgr = &ctx->sig_mgr;

	idr_for_each_entry(&mgr->handles, handle, id)
		if (handle->cs_seq == ULLONG_MAX)
			kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);

	hl_ctx_put(ctx);
}

void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
{
	int i;
	struct hl_cs *cs, *tmp;

	if (!skip_wq_flush) {
		flush_workqueue(hdev->ts_free_obj_wq);

		/* flush all completions before iterating over the CS mirror list in
		 * order to avoid a race with the release functions
		 */
		for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
			flush_workqueue(hdev->cq_wq[i]);

		flush_workqueue(hdev->cs_cmplt_wq);
	}

	/* Make sure we don't have leftovers in the CS mirror list */
	list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) {
		cs_get(cs);
		cs->aborted = true;
		dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
					cs->ctx->asid, cs->sequence);
		cs_rollback(hdev, cs);
		cs_put(cs);
	}

	force_complete_multi_cs(hdev);

	release_reserved_encaps_signals(hdev);
}

static void
wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
{
	struct hl_user_pending_interrupt *pend, *temp;
	unsigned long flags;

	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
	list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, list_node) {
		pend->fence.error = -EIO;
		complete_all(&pend->fence.completion);
	}
	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);

	spin_lock_irqsave(&interrupt->ts_list_lock, flags);
	list_for_each_entry_safe(pend, temp, &interrupt->ts_list_head, list_node) {
		list_del(&pend->list_node);
		hl_mmap_mem_buf_put(pend->ts_reg_info.buf);
		hl_cb_put(pend->ts_reg_info.cq_cb);
	}
	spin_unlock_irqrestore(&interrupt->ts_list_lock, flags);
}

void hl_release_pending_user_interrupts(struct hl_device *hdev)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct hl_user_interrupt *interrupt;
	int i;

	if (!prop->user_interrupt_count)
		return;

	/* We iterate through the user interrupt requests and wake up all
	 * user threads waiting for interrupt completion. We iterate the
	 * list under a lock; this is why all user threads, once awake,
	 * will wait on the same lock and will release the waiting object upon
	 * unlock.
	 */

	for (i = 0 ; i < prop->user_interrupt_count ; i++) {
		interrupt = &hdev->user_interrupt[i];
		wake_pending_user_interrupt_threads(interrupt);
	}

	interrupt = &hdev->common_user_cq_interrupt;
	wake_pending_user_interrupt_threads(interrupt);

	interrupt = &hdev->common_decoder_interrupt;
	wake_pending_user_interrupt_threads(interrupt);
}

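/*
 * force_complete_cs - mark every CS on the mirror list as failed (-EIO) and
 * complete its fence, releasing any thread waiting on it.
 */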
static void force_complete_cs(struct hl_device *hdev)
{
	struct hl_cs *cs;

	spin_lock(&hdev->cs_mirror_lock);

	list_for_each_entry(cs, &hdev->cs_mirror_list, mirror_node) {
		cs->fence->error = -EIO;
		complete_all(&cs->fence->completion);
	}

	spin_unlock(&hdev->cs_mirror_lock);
}

void hl_abort_waiting_for_cs_completions(struct hl_device *hdev)
{
	force_complete_cs(hdev);
	force_complete_multi_cs(hdev);
}

static void job_wq_completion(struct work_struct *work)
{
	struct hl_cs_job *job = container_of(work, struct hl_cs_job,
						finish_work);
	struct hl_cs *cs = job->cs;
	struct hl_device *hdev = cs->ctx->hdev;

	/* job is no longer needed */
	hl_complete_job(hdev, job);
}

static void cs_completion(struct work_struct *work)
{
	struct hl_cs *cs = container_of(work, struct hl_cs, finish_work);
	struct hl_device *hdev = cs->ctx->hdev;
	struct hl_cs_job *job, *tmp;

	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
		hl_complete_job(hdev, job);
}

u32 hl_get_active_cs_num(struct hl_device *hdev)
{
	u32 active_cs_num = 0;
	struct hl_cs *cs;

	spin_lock(&hdev->cs_mirror_lock);

	list_for_each_entry(cs, &hdev->cs_mirror_list, mirror_node)
		if (!cs->completed)
			active_cs_num++;

	spin_unlock(&hdev->cs_mirror_lock);

	return active_cs_num;
}

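/*
 * validate_queue_index - verify that the queue index of a CS chunk refers to a
 * usable queue, and report the queue type and whether the CB should be
 * allocated by the kernel driver.
 */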
static int validate_queue_index(struct hl_device *hdev,
				struct hl_cs_chunk *chunk,
				enum hl_queue_type *queue_type,
				bool *is_kernel_allocated_cb)
{
	struct asic_fixed_properties *asic = &hdev->asic_prop;
	struct hw_queue_properties *hw_queue_prop;

	/* This must be checked here to prevent out-of-bounds access to
	 * hw_queues_props array
	 */
	if (chunk->queue_index >= asic->max_queues) {
		dev_err(hdev->dev, "Queue index %d is invalid\n",
			chunk->queue_index);
		return -EINVAL;
	}

	hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];

	if (hw_queue_prop->type == QUEUE_TYPE_NA) {
		dev_err(hdev->dev, "Queue index %d is not applicable\n",
			chunk->queue_index);
		return -EINVAL;
	}

	if (hw_queue_prop->binned) {
		dev_err(hdev->dev, "Queue index %d is binned out\n",
			chunk->queue_index);
		return -EINVAL;
	}

	if (hw_queue_prop->driver_only) {
		dev_err(hdev->dev,
			"Queue index %d is restricted for the kernel driver\n",
			chunk->queue_index);
		return -EINVAL;
	}

	/* When the hw queue type isn't QUEUE_TYPE_HW,
	 * the USER_ALLOC_CB flag is treated as "don't care".
	 */
	if (hw_queue_prop->type == QUEUE_TYPE_HW) {
		if (chunk->cs_chunk_flags & HL_CS_CHUNK_FLAGS_USER_ALLOC_CB) {
			if (!(hw_queue_prop->cb_alloc_flags & CB_ALLOC_USER)) {
				dev_err(hdev->dev,
					"Queue index %d doesn't support user CB\n",
					chunk->queue_index);
				return -EINVAL;
			}

			*is_kernel_allocated_cb = false;
		} else {
			if (!(hw_queue_prop->cb_alloc_flags &
					CB_ALLOC_KERNEL)) {
				dev_err(hdev->dev,
					"Queue index %d doesn't support kernel CB\n",
					chunk->queue_index);
				return -EINVAL;
			}

			*is_kernel_allocated_cb = true;
		}
	} else {
		*is_kernel_allocated_cb = !!(hw_queue_prop->cb_alloc_flags
						& CB_ALLOC_KERNEL);
	}

	*queue_type = hw_queue_prop->type;
	return 0;
}

static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
					struct hl_mem_mgr *mmg,
					struct hl_cs_chunk *chunk)
{
	struct hl_cb *cb;

	cb = hl_cb_get(mmg, chunk->cb_handle);
	if (!cb) {
		dev_err(hdev->dev, "CB handle 0x%llx invalid\n", chunk->cb_handle);
		return NULL;
	}

	if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) {
		dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size);
		goto release_cb;
	}

	atomic_inc(&cb->cs_cnt);

	return cb;

release_cb:
	hl_cb_put(cb);
	return NULL;
}

struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
		enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
{
	struct hl_cs_job *job;

	job = kzalloc(sizeof(*job), GFP_ATOMIC);
	if (!job)
		job = kzalloc(sizeof(*job), GFP_KERNEL);

	if (!job)
		return NULL;

	kref_init(&job->refcount);
	job->queue_type = queue_type;
	job->is_kernel_allocated_cb = is_kernel_allocated_cb;

	if (is_cb_patched(hdev, job))
		INIT_LIST_HEAD(&job->userptr_list);

	if (job->queue_type == QUEUE_TYPE_EXT)
		INIT_WORK(&job->finish_work, job_wq_completion);

	return job;
}

static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
{
	if (cs_type_flags & HL_CS_FLAGS_SIGNAL)
		return CS_TYPE_SIGNAL;
	else if (cs_type_flags & HL_CS_FLAGS_WAIT)
		return CS_TYPE_WAIT;
	else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT)
		return CS_TYPE_COLLECTIVE_WAIT;
	else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY)
		return CS_RESERVE_SIGNALS;
	else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
		return CS_UNRESERVE_SIGNALS;
	else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND)
		return CS_TYPE_ENGINE_CORE;
	else if (cs_type_flags & HL_CS_FLAGS_ENGINES_COMMAND)
		return CS_TYPE_ENGINES;
	else if (cs_type_flags & HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES)
		return CS_TYPE_FLUSH_PCI_HBW_WRITES;
	else
		return CS_TYPE_DEFAULT;
}

static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
{
	struct hl_device *hdev = hpriv->hdev;
	struct hl_ctx *ctx = hpriv->ctx;
	u32 cs_type_flags, num_chunks;
	enum hl_device_status status;
	enum hl_cs_type cs_type;
	bool is_sync_stream;
	int i;

	for (i = 0 ; i < sizeof(args->in.pad) ; i++)
		if (args->in.pad[i]) {
			dev_dbg(hdev->dev, "Padding bytes must be 0\n");
			return -EINVAL;
		}

	if (!hl_device_operational(hdev, &status))
		return -EBUSY;

	if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
			!hdev->supports_staged_submission) {
		dev_err(hdev->dev, "staged submission not supported");
		return -EPERM;
	}

	cs_type_flags = args->in.cs_flags & HL_CS_FLAGS_TYPE_MASK;

	if (unlikely(cs_type_flags && !is_power_of_2(cs_type_flags))) {
		dev_err(hdev->dev,
			"CS type flags are mutually exclusive, context %d\n",
			ctx->asid);
		return -EINVAL;
	}

	cs_type = hl_cs_get_cs_type(cs_type_flags);
	num_chunks = args->in.num_chunks_execute;

	is_sync_stream = (cs_type == CS_TYPE_SIGNAL || cs_type == CS_TYPE_WAIT ||
			cs_type == CS_TYPE_COLLECTIVE_WAIT);

	if (unlikely(is_sync_stream && !hdev->supports_sync_stream)) {
		dev_err(hdev->dev, "Sync stream CS is not supported\n");
		return -EINVAL;
	}

	if (cs_type == CS_TYPE_DEFAULT) {
		if (!num_chunks) {
			dev_err(hdev->dev, "Got execute CS with 0 chunks, context %d\n", ctx->asid);
			return -EINVAL;
		}
	} else if (is_sync_stream && num_chunks != 1) {
		dev_err(hdev->dev,
			"Sync stream CS mandates one chunk only, context %d\n",
			ctx->asid);
		return -EINVAL;
	}

	return 0;
}

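/*
 * hl_cs_copy_chunk_array - copy the array of CS chunks from user space, after
 * verifying that the number of chunks does not exceed HL_MAX_JOBS_PER_CS.
 */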
static int hl_cs_copy_chunk_array(struct hl_device *hdev,
					struct hl_cs_chunk **cs_chunk_array,
					void __user *chunks, u32 num_chunks,
					struct hl_ctx *ctx)
{
	u32 size_to_copy;

	if (num_chunks > HL_MAX_JOBS_PER_CS) {
		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
		dev_err(hdev->dev,
			"Number of chunks can NOT be larger than %d\n",
			HL_MAX_JOBS_PER_CS);
		return -EINVAL;
	}

	*cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array),
					GFP_ATOMIC);
	if (!*cs_chunk_array)
		*cs_chunk_array = kmalloc_array(num_chunks,
					sizeof(**cs_chunk_array), GFP_KERNEL);
	if (!*cs_chunk_array) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
		return -ENOMEM;
	}

	size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
	if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) {
		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
		dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
		kfree(*cs_chunk_array);
		return -EFAULT;
	}

	return 0;
}

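/*
 * cs_staged_submission - mark a CS as part of a staged submission and set its
 * staged sequence and first/last indications according to the user flags.
 */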
static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
				u64 sequence, u32 flags,
				u32 encaps_signal_handle)
{
	if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION))
		return 0;

	cs->staged_last = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_LAST);
	cs->staged_first = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST);

	if (cs->staged_first) {
		/* Staged CS sequence is the first CS sequence */
		INIT_LIST_HEAD(&cs->staged_cs_node);
		cs->staged_sequence = cs->sequence;

		if (cs->encaps_signals)
			cs->encaps_sig_hdl_id = encaps_signal_handle;
	} else {
		/* User sequence will be validated in 'hl_hw_queue_schedule_cs'
		 * under the cs_mirror_lock
		 */
		cs->staged_sequence = sequence;
	}

	/* Increment CS reference if needed */
	staged_cs_get(hdev, cs);

	cs->staged_cs = true;

	return 0;
}

static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
{
	int i;

	for (i = 0; i < hdev->stream_master_qid_arr_size; i++)
		if (qid == hdev->stream_master_qid_arr[i])
			return BIT(i);

	return 0;
}

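/*
 * cs_ioctl_default - handle a default (execute) CS submission: copy the chunk
 * array from the user, allocate a CS object, validate each chunk, create and
 * parse a job per chunk, and schedule the CS on the H/W queues.
 */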
static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
				u32 num_chunks, u64 *cs_seq, u32 flags,
				u32 encaps_signals_handle, u32 timeout,
				u16 *signal_initial_sob_count)
{
	bool staged_mid, int_queues_only = true, using_hw_queues = false;
	struct hl_device *hdev = hpriv->hdev;
	struct hl_cs_chunk *cs_chunk_array;
	struct hl_cs_counters_atomic *cntr;
	struct hl_ctx *ctx = hpriv->ctx;
	struct hl_cs_job *job;
	struct hl_cs *cs;
	struct hl_cb *cb;
	u64 user_sequence;
	u8 stream_master_qid_map = 0;
	int rc, i;

	cntr = &hdev->aggregated_cs_counters;
	user_sequence = *cs_seq;
	*cs_seq = ULLONG_MAX;

	rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
			hpriv->ctx);
	if (rc)
		goto out;

	if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
			!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
		staged_mid = true;
	else
		staged_mid = false;

	rc = allocate_cs(hdev, hpriv->ctx, CS_TYPE_DEFAULT,
			staged_mid ? user_sequence : ULLONG_MAX, &cs, flags,
			timeout);
	if (rc)
		goto free_cs_chunk_array;

	*cs_seq = cs->sequence;

	hl_debugfs_add_cs(cs);

	rc = cs_staged_submission(hdev, cs, user_sequence, flags,
						encaps_signals_handle);
	if (rc)
		goto free_cs_object;

	/* If this is a staged submission we must return the staged sequence
	 * rather than the internal CS sequence
	 */
	if (cs->staged_cs)
		*cs_seq = cs->staged_sequence;

	/* Validate ALL the CS chunks before submitting the CS */
	for (i = 0 ; i < num_chunks ; i++) {
		struct hl_cs_chunk *chunk = &cs_chunk_array[i];
		enum hl_queue_type queue_type;
		bool is_kernel_allocated_cb;

		rc = validate_queue_index(hdev, chunk, &queue_type,
						&is_kernel_allocated_cb);
		if (rc) {
			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
			atomic64_inc(&cntr->validation_drop_cnt);
			goto free_cs_object;
		}

		if (is_kernel_allocated_cb) {
			cb = get_cb_from_cs_chunk(hdev, &hpriv->mem_mgr, chunk);
			if (!cb) {
				atomic64_inc(
					&ctx->cs_counters.validation_drop_cnt);
				atomic64_inc(&cntr->validation_drop_cnt);
				rc = -EINVAL;
				goto free_cs_object;
			}
		} else {
			cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
		}

		if (queue_type == QUEUE_TYPE_EXT ||
						queue_type == QUEUE_TYPE_HW) {
			int_queues_only = false;

			/*
			 * store which streams are being used for external/HW
			 * queues of this CS
			 */
			if (hdev->supports_wait_for_multi_cs)
				stream_master_qid_map |=
					get_stream_master_qid_mask(hdev,
							chunk->queue_index);
		}

		if (queue_type == QUEUE_TYPE_HW)
			using_hw_queues = true;

		job = hl_cs_allocate_job(hdev, queue_type,
						is_kernel_allocated_cb);
		if (!job) {
			atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
			atomic64_inc(&cntr->out_of_mem_drop_cnt);
			dev_err(hdev->dev, "Failed to allocate a new job\n");
			rc = -ENOMEM;
			if (is_kernel_allocated_cb)
				goto release_cb;

			goto free_cs_object;
		}

		job->id = i + 1;
		job->cs = cs;
		job->user_cb = cb;
		job->user_cb_size = chunk->cb_size;
		job->hw_queue_id = chunk->queue_index;

		cs->jobs_in_queue_cnt[job->hw_queue_id]++;
		cs->jobs_cnt++;

		list_add_tail(&job->cs_node, &cs->job_list);

		/*
		 * Increment CS reference. When the CS reference is 0, the CS
		 * is done and can be signaled to the user and all its
		 * resources freed.
		 * Only increment for jobs on external or H/W queues, because
		 * only those jobs get a completion.
		 */
1616		if (cs_needs_completion(cs) &&
1617			(job->queue_type == QUEUE_TYPE_EXT ||
1618				job->queue_type == QUEUE_TYPE_HW))
1619			cs_get(cs);
1620
1621		hl_debugfs_add_job(hdev, job);
1622
1623		rc = cs_parser(hpriv, job);
1624		if (rc) {
1625			atomic64_inc(&ctx->cs_counters.parsing_drop_cnt);
1626			atomic64_inc(&cntr->parsing_drop_cnt);
1627			dev_err(hdev->dev,
1628				"Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
1629				cs->ctx->asid, cs->sequence, job->id, rc);
1630			goto free_cs_object;
1631		}
1632	}
1633
1634	/* We allow a CS with any queue type combination as long as it does
1635	 * not get a completion
1636	 */
1637	if (int_queues_only && cs_needs_completion(cs)) {
1638		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1639		atomic64_inc(&cntr->validation_drop_cnt);
1640		dev_err(hdev->dev,
1641			"Reject CS %d.%llu since it contains only internal queues jobs and needs completion\n",
1642			cs->ctx->asid, cs->sequence);
1643		rc = -EINVAL;
1644		goto free_cs_object;
1645	}
1646
1647	if (using_hw_queues)
1648		INIT_WORK(&cs->finish_work, cs_completion);
1649
1650	/*
1651	 * store the (external/HW queues) streams used by the CS in the
1652	 * fence object for multi-CS completion
1653	 */
1654	if (hdev->supports_wait_for_multi_cs)
1655		cs->fence->stream_master_qid_map = stream_master_qid_map;
1656
1657	rc = hl_hw_queue_schedule_cs(cs);
1658	if (rc) {
1659		if (rc != -EAGAIN)
1660			dev_err(hdev->dev,
1661				"Failed to submit CS %d.%llu to H/W queues, error %d\n",
1662				cs->ctx->asid, cs->sequence, rc);
1663		goto free_cs_object;
1664	}
1665
1666	*signal_initial_sob_count = cs->initial_sob_count;
1667
1668	rc = HL_CS_STATUS_SUCCESS;
1669	goto put_cs;
1670
1671release_cb:
1672	atomic_dec(&cb->cs_cnt);
1673	hl_cb_put(cb);
1674free_cs_object:
1675	cs_rollback(hdev, cs);
1676	*cs_seq = ULLONG_MAX;
1677	/* The path below is both for good and erroneous exits */
1678put_cs:
1679	/* We finished with the CS in this function, so put the ref */
1680	cs_put(cs);
1681free_cs_chunk_array:
1682	kfree(cs_chunk_array);
1683out:
1684	return rc;
1685}
1686
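/*
 * hl_cs_ctx_switch - handle the context-switch/restore phase before a CS.
 *
 * If this is the first CS of the context (or HL_CS_FLAGS_FORCE_RESTORE is
 * set), perform the ASIC context switch, submit the restore-phase CS and
 * wait for its completion. Other threads of the same context poll until the
 * context-switch phase is done. If the context switch fails with
 * -ETIMEDOUT/-EBUSY, a soft reset is issued at the end of this function.
 */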
1687static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
1688				u64 *cs_seq)
1689{
1690	struct hl_device *hdev = hpriv->hdev;
1691	struct hl_ctx *ctx = hpriv->ctx;
1692	bool need_soft_reset = false;
1693	int rc = 0, do_ctx_switch = 0;
1694	void __user *chunks;
1695	u32 num_chunks, tmp;
1696	u16 sob_count;
1697	int ret;
1698
1699	if (hdev->supports_ctx_switch)
1700		do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);
1701
1702	if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
1703		mutex_lock(&hpriv->restore_phase_mutex);
1704
1705		if (do_ctx_switch) {
1706			rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
1707			if (rc) {
1708				dev_err_ratelimited(hdev->dev,
1709					"Failed to switch to context %d, rejecting CS! %d\n",
1710					ctx->asid, rc);
				/*
				 * If we timed out, or if the device is not
				 * IDLE while we want to do a context switch
				 * (-EBUSY), we need to soft-reset because the
				 * QMAN is probably stuck. However, we can't
				 * call the reset here directly because of a
				 * deadlock, so we need to do it at the very
				 * end of this function.
				 */
1720				if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
1721					need_soft_reset = true;
1722				mutex_unlock(&hpriv->restore_phase_mutex);
1723				goto out;
1724			}
1725		}
1726
1727		hdev->asic_funcs->restore_phase_topology(hdev);
1728
1729		chunks = (void __user *) (uintptr_t) args->in.chunks_restore;
1730		num_chunks = args->in.num_chunks_restore;
1731
1732		if (!num_chunks) {
1733			dev_dbg(hdev->dev,
1734				"Need to run restore phase but restore CS is empty\n");
1735			rc = 0;
1736		} else {
1737			rc = cs_ioctl_default(hpriv, chunks, num_chunks,
1738					cs_seq, 0, 0, hdev->timeout_jiffies, &sob_count);
1739		}
1740
1741		mutex_unlock(&hpriv->restore_phase_mutex);
1742
1743		if (rc) {
1744			dev_err(hdev->dev,
1745				"Failed to submit restore CS for context %d (%d)\n",
1746				ctx->asid, rc);
1747			goto out;
1748		}
1749
1750		/* Need to wait for restore completion before execution phase */
1751		if (num_chunks) {
1752			enum hl_cs_wait_status status;
1753
1754			ret = _hl_cs_wait_ioctl(hdev, ctx,
1755					jiffies_to_usecs(hdev->timeout_jiffies),
1756					*cs_seq, &status, NULL);
1757			if (ret) {
1758				dev_err(hdev->dev,
1759					"Restore CS for context %d failed to complete %d\n",
1760					ctx->asid, ret);
1761				rc = -ENOEXEC;
1762				goto out;
1763			}
1764		}
1765
1766		if (hdev->supports_ctx_switch)
1767			ctx->thread_ctx_switch_wait_token = 1;
1768
1769	} else if (hdev->supports_ctx_switch && !ctx->thread_ctx_switch_wait_token) {
1770		rc = hl_poll_timeout_memory(hdev,
1771			&ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1),
1772			100, jiffies_to_usecs(hdev->timeout_jiffies), false);
1773
1774		if (rc == -ETIMEDOUT) {
1775			dev_err(hdev->dev,
1776				"context switch phase timeout (%d)\n", tmp);
1777			goto out;
1778		}
1779	}
1780
1781out:
1782	if ((rc == -ETIMEDOUT || rc == -EBUSY) && (need_soft_reset))
1783		hl_device_reset(hdev, 0);
1784
1785	return rc;
1786}
1787
1788/*
 * hl_cs_signal_sob_wraparound_handler: handle the SOB value wraparound case.
 * If the SOB value reaches the max value, move to the other SOB reserved
 * for the queue.
 * @hdev: pointer to device structure
 * @q_idx: stream queue index
 * @hw_sob: the H/W SOB used in this signal CS.
 * @count: signals count
 * @encaps_sig: tells whether it's a reservation for encaps signals or not.
1797 *
1798 * Note that this function must be called while hw_queues_lock is taken.
1799 */
1800int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
			struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig)
{
1804	struct hl_sync_stream_properties *prop;
1805	struct hl_hw_sob *sob = *hw_sob, *other_sob;
1806	u8 other_sob_offset;
1807
1808	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
1809
1810	hw_sob_get(sob);
1811
1812	/* check for wraparound */
1813	if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) {
1814		/*
1815		 * Decrement as we reached the max value.
1816		 * The release function won't be called here as we've
1817		 * just incremented the refcount right before calling this
1818		 * function.
1819		 */
1820		hw_sob_put_err(sob);
1821
		/*
		 * Check the other SOB value: if it is still in use then fail,
		 * otherwise make the switch.
		 */
1826		other_sob_offset = (prop->curr_sob_offset + 1) % HL_RSVD_SOBS;
1827		other_sob = &prop->hw_sob[other_sob_offset];
1828
1829		if (kref_read(&other_sob->kref) != 1) {
1830			dev_err(hdev->dev, "error: Cannot switch SOBs q_idx: %d\n",
1831								q_idx);
1832			return -EINVAL;
1833		}
1834
		/*
		 * next_sob_val always points to the next available signal
		 * in the SOB, so for encaps signals it will be the next one
		 * after reserving the required amount.
		 */
1840		if (encaps_sig)
1841			prop->next_sob_val = count + 1;
1842		else
1843			prop->next_sob_val = count;
1844
1845		/* only two SOBs are currently in use */
1846		prop->curr_sob_offset = other_sob_offset;
1847		*hw_sob = other_sob;
1848
		/*
		 * Check if other_sob needs a reset and, if so, do it before
		 * using it for the reservation or the next signal CS.
		 * We do it here, for both the encaps and the regular signal CS
		 * cases, in order to avoid a possible race of two kref_put
		 * calls on the SOB, which can occur at the same time if we
		 * move the SOB reset (kref_put) to the cs_do_release function.
		 * In addition, if we have a combination of signal CS and
		 * encaps signals, and at the point we need to reset the SOB
		 * there are no more reservations and only signal CSs keep
		 * coming, then the signal CS must put the refcount and reset
		 * the SOB.
		 */
1862		if (other_sob->need_reset)
1863			hw_sob_put(other_sob);
1864
1865		if (encaps_sig) {
1866			/* set reset indication for the sob */
1867			sob->need_reset = true;
1868			hw_sob_get(other_sob);
1869		}
1870
1871		dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
1872				prop->curr_sob_offset, q_idx);
1873	} else {
1874		prop->next_sob_val += count;
1875	}
1876
1877	return 0;
1878}
1879
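/*
 * cs_ioctl_extract_signal_seq - extract the signal CS sequence to wait on.
 *
 * For encapsulated signals, the sequence is taken directly from the chunk.
 * Otherwise, the signal sequence array (currently limited to a single entry)
 * is copied from user space and its first entry is returned.
 */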
1880static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
1881		struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx,
1882		bool encaps_signals)
1883{
1884	u64 *signal_seq_arr = NULL;
1885	u32 size_to_copy, signal_seq_arr_len;
1886	int rc = 0;
1887
1888	if (encaps_signals) {
1889		*signal_seq = chunk->encaps_signal_seq;
1890		return 0;
1891	}
1892
1893	signal_seq_arr_len = chunk->num_signal_seq_arr;
1894
1895	/* currently only one signal seq is supported */
1896	if (signal_seq_arr_len != 1) {
1897		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1898		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1899		dev_err(hdev->dev,
1900			"Wait for signal CS supports only one signal CS seq\n");
1901		return -EINVAL;
1902	}
1903
1904	signal_seq_arr = kmalloc_array(signal_seq_arr_len,
1905					sizeof(*signal_seq_arr),
1906					GFP_ATOMIC);
1907	if (!signal_seq_arr)
1908		signal_seq_arr = kmalloc_array(signal_seq_arr_len,
1909					sizeof(*signal_seq_arr),
1910					GFP_KERNEL);
1911	if (!signal_seq_arr) {
1912		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1913		atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
1914		return -ENOMEM;
1915	}
1916
1917	size_to_copy = signal_seq_arr_len * sizeof(*signal_seq_arr);
1918	if (copy_from_user(signal_seq_arr,
1919				u64_to_user_ptr(chunk->signal_seq_arr),
1920				size_to_copy)) {
1921		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
1922		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
1923		dev_err(hdev->dev,
1924			"Failed to copy signal seq array from user\n");
1925		rc = -EFAULT;
1926		goto out;
1927	}
1928
1929	/* currently it is guaranteed to have only one signal seq */
1930	*signal_seq = signal_seq_arr[0];
1931
1932out:
1933	kfree(signal_seq_arr);
1934
1935	return rc;
1936}
1937
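/*
 * cs_ioctl_signal_wait_create_jobs - create the single kernel job of a
 * signal/wait CS.
 *
 * The job gets a kernel-allocated CB which is used as the patched CB without
 * parsing, and the CS refcount is incremented since such jobs get a
 * completion.
 */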
1938static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
1939		struct hl_ctx *ctx, struct hl_cs *cs,
1940		enum hl_queue_type q_type, u32 q_idx, u32 encaps_signal_offset)
1941{
1942	struct hl_cs_counters_atomic *cntr;
1943	struct hl_cs_job *job;
1944	struct hl_cb *cb;
1945	u32 cb_size;
1946
1947	cntr = &hdev->aggregated_cs_counters;
1948
1949	job = hl_cs_allocate_job(hdev, q_type, true);
1950	if (!job) {
1951		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1952		atomic64_inc(&cntr->out_of_mem_drop_cnt);
1953		dev_err(hdev->dev, "Failed to allocate a new job\n");
1954		return -ENOMEM;
1955	}
1956
1957	if (cs->type == CS_TYPE_WAIT)
1958		cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
1959	else
1960		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
1961
1962	cb = hl_cb_kernel_create(hdev, cb_size, q_type == QUEUE_TYPE_HW);
1963	if (!cb) {
1964		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
1965		atomic64_inc(&cntr->out_of_mem_drop_cnt);
1966		kfree(job);
1967		return -EFAULT;
1968	}
1969
1970	job->id = 0;
1971	job->cs = cs;
1972	job->user_cb = cb;
1973	atomic_inc(&job->user_cb->cs_cnt);
1974	job->user_cb_size = cb_size;
1975	job->hw_queue_id = q_idx;
1976
1977	if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
1978			&& cs->encaps_signals)
1979		job->encaps_sig_wait_offset = encaps_signal_offset;
	/*
	 * No need for parsing, the user CB is the patched CB.
	 * We call hl_cb_destroy() for two reasons: we don't need the CB in
	 * the CB idr anymore, and to decrement its refcount as it was
	 * incremented inside hl_cb_kernel_create().
	 */
1986	job->patched_cb = job->user_cb;
1987	job->job_cb_size = job->user_cb_size;
1988	hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle);
1989
1990	/* increment refcount as for external queues we get completion */
1991	cs_get(cs);
1992
1993	cs->jobs_in_queue_cnt[job->hw_queue_id]++;
1994	cs->jobs_cnt++;
1995
1996	list_add_tail(&job->cs_node, &cs->job_list);
1997
1998	hl_debugfs_add_job(hdev, job);
1999
2000	return 0;
2001}
2002
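/*
 * cs_ioctl_reserve_signals - reserve 'count' signals on a sync-stream queue.
 *
 * Advances the queue's SOB value by 'count' (switching SOBs on wraparound),
 * allocates an encapsulated-signals handle in the context IDR and returns
 * the handle id, the SOB address and the SOB value after the reservation.
 */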
2003static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
2004				u32 q_idx, u32 count,
2005				u32 *handle_id, u32 *sob_addr,
2006				u32 *signals_count)
2007{
2008	struct hw_queue_properties *hw_queue_prop;
2009	struct hl_sync_stream_properties *prop;
2010	struct hl_device *hdev = hpriv->hdev;
2011	struct hl_cs_encaps_sig_handle *handle;
2012	struct hl_encaps_signals_mgr *mgr;
2013	struct hl_hw_sob *hw_sob;
2014	int hdl_id;
2015	int rc = 0;
2016
2017	if (count >= HL_MAX_SOB_VAL) {
2018		dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n",
2019						count);
2020		rc = -EINVAL;
2021		goto out;
2022	}
2023
2024	if (q_idx >= hdev->asic_prop.max_queues) {
2025		dev_err(hdev->dev, "Queue index %d is invalid\n",
2026			q_idx);
2027		rc = -EINVAL;
2028		goto out;
2029	}
2030
2031	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
2032
2033	if (!hw_queue_prop->supports_sync_stream) {
2034		dev_err(hdev->dev,
2035			"Queue index %d does not support sync stream operations\n",
2036									q_idx);
2037		rc = -EINVAL;
2038		goto out;
2039	}
2040
2041	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
2042
2043	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
2044	if (!handle) {
2045		rc = -ENOMEM;
2046		goto out;
2047	}
2048
2049	handle->count = count;
2050
2051	hl_ctx_get(hpriv->ctx);
2052	handle->ctx = hpriv->ctx;
2053	mgr = &hpriv->ctx->sig_mgr;
2054
2055	spin_lock(&mgr->lock);
2056	hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC);
2057	spin_unlock(&mgr->lock);
2058
2059	if (hdl_id < 0) {
2060		dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n");
2061		rc = -EINVAL;
2062		goto put_ctx;
2063	}
2064
2065	handle->id = hdl_id;
2066	handle->q_idx = q_idx;
2067	handle->hdev = hdev;
2068	kref_init(&handle->refcount);
2069
2070	hdev->asic_funcs->hw_queues_lock(hdev);
2071
2072	hw_sob = &prop->hw_sob[prop->curr_sob_offset];
2073
	/*
	 * Increment the SOB value by count, per the user request, to reserve
	 * those signals.
	 * Check that the amount of signals to reserve does not exceed the max
	 * SOB value; if it does, switch SOBs.
	 */
2080	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count,
2081								true);
2082	if (rc) {
2083		dev_err(hdev->dev, "Failed to switch SOB\n");
2084		hdev->asic_funcs->hw_queues_unlock(hdev);
2085		rc = -EINVAL;
2086		goto remove_idr;
2087	}
	/* Set the hw_sob in the handle only after calling the SOB wraparound
	 * handler, since the SOB could have changed.
	 */
2091	handle->hw_sob = hw_sob;
2092
2093	/* store the current sob value for unreserve validity check, and
2094	 * signal offset support
2095	 */
2096	handle->pre_sob_val = prop->next_sob_val - handle->count;
2097
2098	handle->cs_seq = ULLONG_MAX;
2099
2100	*signals_count = prop->next_sob_val;
2101	hdev->asic_funcs->hw_queues_unlock(hdev);
2102
2103	*sob_addr = handle->hw_sob->sob_addr;
2104	*handle_id = hdl_id;
2105
2106	dev_dbg(hdev->dev,
2107		"Signals reserved, sob_id: %d, sob addr: 0x%x, last sob_val: %u, q_idx: %d, hdl_id: %d\n",
2108			hw_sob->sob_id, handle->hw_sob->sob_addr,
2109			prop->next_sob_val - 1, q_idx, hdl_id);
2110	goto out;
2111
2112remove_idr:
2113	spin_lock(&mgr->lock);
2114	idr_remove(&mgr->handles, hdl_id);
2115	spin_unlock(&mgr->lock);
2116
2117put_ctx:
2118	hl_ctx_put(handle->ctx);
2119	kfree(handle);
2120
2121out:
2122	return rc;
2123}
2124
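/*
 * cs_ioctl_unreserve_signals - release a reserved encapsulated-signals handle.
 *
 * The reservation can be rolled back only if the SOB value was not advanced
 * by other submissions, and the SOB itself was not switched, since the
 * reserve call; otherwise the request is rejected.
 */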
2125static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
2126{
2127	struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
2128	struct hl_sync_stream_properties *prop;
2129	struct hl_device *hdev = hpriv->hdev;
2130	struct hl_encaps_signals_mgr *mgr;
2131	struct hl_hw_sob *hw_sob;
2132	u32 q_idx, sob_addr;
2133	int rc = 0;
2134
2135	mgr = &hpriv->ctx->sig_mgr;
2136
2137	spin_lock(&mgr->lock);
2138	encaps_sig_hdl = idr_find(&mgr->handles, handle_id);
2139	if (encaps_sig_hdl) {
2140		dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n",
2141				handle_id, encaps_sig_hdl->hw_sob->sob_addr,
2142					encaps_sig_hdl->count);
2143
2144		hdev->asic_funcs->hw_queues_lock(hdev);
2145
2146		q_idx = encaps_sig_hdl->q_idx;
2147		prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
2148		hw_sob = &prop->hw_sob[prop->curr_sob_offset];
2149		sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
2150
		/* Check if sob_val got out of sync due to other
		 * signal submission requests which were handled
		 * between the reserve and unreserve calls, or due to a
		 * SOB switch upon reaching the SOB max value.
		 */
2156		if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count
2157				!= prop->next_sob_val ||
2158				sob_addr != encaps_sig_hdl->hw_sob->sob_addr) {
2159			dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n",
2160				encaps_sig_hdl->pre_sob_val,
2161				(prop->next_sob_val - encaps_sig_hdl->count));
2162
2163			hdev->asic_funcs->hw_queues_unlock(hdev);
2164			rc = -EINVAL;
2165			goto out_unlock;
2166		}
2167
2168		/*
2169		 * Decrement the SOB value by count by user request
2170		 * to unreserve those signals
2171		 */
2172		prop->next_sob_val -= encaps_sig_hdl->count;
2173
2174		hdev->asic_funcs->hw_queues_unlock(hdev);
2175
2176		hw_sob_put(hw_sob);
2177
2178		/* Release the id and free allocated memory of the handle */
2179		idr_remove(&mgr->handles, handle_id);
2180
2181		/* unlock before calling ctx_put, where we might sleep */
2182		spin_unlock(&mgr->lock);
2183		hl_ctx_put(encaps_sig_hdl->ctx);
2184		kfree(encaps_sig_hdl);
2185		goto out;
2186	} else {
2187		rc = -EINVAL;
2188		dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n");
2189	}
2190
2191out_unlock:
2192	spin_unlock(&mgr->lock);
2193
2194out:
2195	return rc;
2196}
2197
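/*
 * cs_ioctl_signal_wait - handle submission of a SIGNAL/WAIT/COLLECTIVE_WAIT CS.
 *
 * For the wait types, the signal CS (or encapsulated-signals handle) to wait
 * on is looked up first; if it has already completed, the ioctl returns
 * success without submitting anything. Otherwise, a CS with a single
 * kernel-generated job is created and scheduled to the H/W queues.
 */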
2198static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
2199				void __user *chunks, u32 num_chunks,
2200				u64 *cs_seq, u32 flags, u32 timeout,
2201				u32 *signal_sob_addr_offset, u16 *signal_initial_sob_count)
2202{
2203	struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL;
2204	bool handle_found = false, is_wait_cs = false,
2205			wait_cs_submitted = false,
2206			cs_encaps_signals = false;
2207	struct hl_cs_chunk *cs_chunk_array, *chunk;
2208	bool staged_cs_with_encaps_signals = false;
2209	struct hw_queue_properties *hw_queue_prop;
2210	struct hl_device *hdev = hpriv->hdev;
2211	struct hl_cs_compl *sig_waitcs_cmpl;
2212	u32 q_idx, collective_engine_id = 0;
2213	struct hl_cs_counters_atomic *cntr;
2214	struct hl_fence *sig_fence = NULL;
2215	struct hl_ctx *ctx = hpriv->ctx;
2216	enum hl_queue_type q_type;
2217	struct hl_cs *cs;
2218	u64 signal_seq;
2219	int rc;
2220
2221	cntr = &hdev->aggregated_cs_counters;
2222	*cs_seq = ULLONG_MAX;
2223
2224	rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
2225			ctx);
2226	if (rc)
2227		goto out;
2228
2229	/* currently it is guaranteed to have only one chunk */
2230	chunk = &cs_chunk_array[0];
2231
2232	if (chunk->queue_index >= hdev->asic_prop.max_queues) {
2233		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2234		atomic64_inc(&cntr->validation_drop_cnt);
2235		dev_err(hdev->dev, "Queue index %d is invalid\n",
2236			chunk->queue_index);
2237		rc = -EINVAL;
2238		goto free_cs_chunk_array;
2239	}
2240
2241	q_idx = chunk->queue_index;
2242	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
2243	q_type = hw_queue_prop->type;
2244
2245	if (!hw_queue_prop->supports_sync_stream) {
2246		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2247		atomic64_inc(&cntr->validation_drop_cnt);
2248		dev_err(hdev->dev,
2249			"Queue index %d does not support sync stream operations\n",
2250			q_idx);
2251		rc = -EINVAL;
2252		goto free_cs_chunk_array;
2253	}
2254
2255	if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
2256		if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
2257			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2258			atomic64_inc(&cntr->validation_drop_cnt);
2259			dev_err(hdev->dev,
2260				"Queue index %d is invalid\n", q_idx);
2261			rc = -EINVAL;
2262			goto free_cs_chunk_array;
2263		}
2264
2265		if (!hdev->nic_ports_mask) {
2266			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2267			atomic64_inc(&cntr->validation_drop_cnt);
2268			dev_err(hdev->dev,
2269				"Collective operations not supported when NIC ports are disabled");
2270			rc = -EINVAL;
2271			goto free_cs_chunk_array;
2272		}
2273
2274		collective_engine_id = chunk->collective_engine_id;
2275	}
2276
2277	is_wait_cs = !!(cs_type == CS_TYPE_WAIT ||
2278			cs_type == CS_TYPE_COLLECTIVE_WAIT);
2279
2280	cs_encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
2281
2282	if (is_wait_cs) {
2283		rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq,
2284				ctx, cs_encaps_signals);
2285		if (rc)
2286			goto free_cs_chunk_array;
2287
2288		if (cs_encaps_signals) {
2289			/* check if cs sequence has encapsulated
2290			 * signals handle
2291			 */
2292			struct idr *idp;
2293			u32 id;
2294
2295			spin_lock(&ctx->sig_mgr.lock);
2296			idp = &ctx->sig_mgr.handles;
2297			idr_for_each_entry(idp, encaps_sig_hdl, id) {
2298				if (encaps_sig_hdl->cs_seq == signal_seq) {
				/* Get a refcount to protect removing this handle from the
				 * idr, needed when multiple wait CSs are used with an offset
				 * to wait on reserved encaps signals.
				 * Since kref_put of this handle is executed outside the
				 * current lock, it is possible that the handle refcount
				 * is 0 but it has yet to be removed from the list. In this
				 * case we need to consider the handle as not valid.
				 */
2307					if (kref_get_unless_zero(&encaps_sig_hdl->refcount))
2308						handle_found = true;
2309					break;
2310				}
2311			}
2312			spin_unlock(&ctx->sig_mgr.lock);
2313
2314			if (!handle_found) {
2315				/* treat as signal CS already finished */
2316				dev_dbg(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n",
2317						signal_seq);
2318				rc = 0;
2319				goto free_cs_chunk_array;
2320			}
2321
2322			/* validate also the signal offset value */
2323			if (chunk->encaps_signal_offset >
2324					encaps_sig_hdl->count) {
				dev_err(hdev->dev, "offset(%u) value exceeds max reserved signals count(%u)!\n",
2326						chunk->encaps_signal_offset,
2327						encaps_sig_hdl->count);
2328				rc = -EINVAL;
2329				goto free_cs_chunk_array;
2330			}
2331		}
2332
2333		sig_fence = hl_ctx_get_fence(ctx, signal_seq);
2334		if (IS_ERR(sig_fence)) {
2335			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2336			atomic64_inc(&cntr->validation_drop_cnt);
2337			dev_err(hdev->dev,
2338				"Failed to get signal CS with seq 0x%llx\n",
2339				signal_seq);
2340			rc = PTR_ERR(sig_fence);
2341			goto free_cs_chunk_array;
2342		}
2343
2344		if (!sig_fence) {
2345			/* signal CS already finished */
2346			rc = 0;
2347			goto free_cs_chunk_array;
2348		}
2349
2350		sig_waitcs_cmpl =
2351			container_of(sig_fence, struct hl_cs_compl, base_fence);
2352
2353		staged_cs_with_encaps_signals = !!
2354				(sig_waitcs_cmpl->type == CS_TYPE_DEFAULT &&
2355				(flags & HL_CS_FLAGS_ENCAP_SIGNALS));
2356
2357		if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL &&
2358				!staged_cs_with_encaps_signals) {
2359			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2360			atomic64_inc(&cntr->validation_drop_cnt);
2361			dev_err(hdev->dev,
2362				"CS seq 0x%llx is not of a signal/encaps-signal CS\n",
2363				signal_seq);
2364			hl_fence_put(sig_fence);
2365			rc = -EINVAL;
2366			goto free_cs_chunk_array;
2367		}
2368
2369		if (completion_done(&sig_fence->completion)) {
2370			/* signal CS already finished */
2371			hl_fence_put(sig_fence);
2372			rc = 0;
2373			goto free_cs_chunk_array;
2374		}
2375	}
2376
2377	rc = allocate_cs(hdev, ctx, cs_type, ULLONG_MAX, &cs, flags, timeout);
2378	if (rc) {
2379		if (is_wait_cs)
2380			hl_fence_put(sig_fence);
2381
2382		goto free_cs_chunk_array;
2383	}
2384
	/*
	 * Save the signal CS fence for later initialization right before
	 * hanging the wait CS on the queue.
	 * For the encaps signals case, we save the CS sequence and the handle
	 * pointer for later initialization.
	 */
2391	if (is_wait_cs) {
2392		cs->signal_fence = sig_fence;
		/* Store the handle pointer, so we won't have to
		 * look for it again later in the flow,
		 * when we need to set the SOB info in the hw_queue.
		 */
2397		if (cs->encaps_signals)
2398			cs->encaps_sig_hdl = encaps_sig_hdl;
2399	}
2400
2401	hl_debugfs_add_cs(cs);
2402
2403	*cs_seq = cs->sequence;
2404
2405	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL)
2406		rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type,
2407				q_idx, chunk->encaps_signal_offset);
2408	else if (cs_type == CS_TYPE_COLLECTIVE_WAIT)
2409		rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
2410				cs, q_idx, collective_engine_id,
2411				chunk->encaps_signal_offset);
2412	else {
2413		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
2414		atomic64_inc(&cntr->validation_drop_cnt);
2415		rc = -EINVAL;
2416	}
2417
2418	if (rc)
2419		goto free_cs_object;
2420
2421	if (q_type == QUEUE_TYPE_HW)
2422		INIT_WORK(&cs->finish_work, cs_completion);
2423
2424	rc = hl_hw_queue_schedule_cs(cs);
2425	if (rc) {
		/* In case a wait CS failed here, it means the signal CS
		 * already completed. We want to free all of its related
		 * objects, but we don't want to fail the ioctl.
		 */
2430		if (is_wait_cs)
2431			rc = 0;
2432		else if (rc != -EAGAIN)
2433			dev_err(hdev->dev,
2434				"Failed to submit CS %d.%llu to H/W queues, error %d\n",
2435				ctx->asid, cs->sequence, rc);
2436		goto free_cs_object;
2437	}
2438
2439	*signal_sob_addr_offset = cs->sob_addr_offset;
2440	*signal_initial_sob_count = cs->initial_sob_count;
2441
2442	rc = HL_CS_STATUS_SUCCESS;
2443	if (is_wait_cs)
2444		wait_cs_submitted = true;
2445	goto put_cs;
2446
2447free_cs_object:
2448	cs_rollback(hdev, cs);
2449	*cs_seq = ULLONG_MAX;
2450	/* The path below is both for good and erroneous exits */
2451put_cs:
2452	/* We finished with the CS in this function, so put the ref */
2453	cs_put(cs);
2454free_cs_chunk_array:
2455	if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
2456		kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
2457	kfree(cs_chunk_array);
2458out:
2459	return rc;
2460}
2461
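/*
 * cs_ioctl_engine_cores - copy the engine-core id array from user space and
 * run/halt the given engine cores.
 */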
2462static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores,
2463						u32 num_engine_cores, u32 core_command)
2464{
2465	struct hl_device *hdev = hpriv->hdev;
2466	void __user *engine_cores_arr;
2467	u32 *cores;
2468	int rc;
2469
2470	if (!hdev->asic_prop.supports_engine_modes)
2471		return -EPERM;
2472
2473	if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) {
2474		dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores);
2475		return -EINVAL;
2476	}
2477
2478	if (core_command != HL_ENGINE_CORE_RUN && core_command != HL_ENGINE_CORE_HALT) {
2479		dev_err(hdev->dev, "Engine core command is invalid\n");
2480		return -EINVAL;
2481	}
2482
2483	engine_cores_arr = (void __user *) (uintptr_t) engine_cores;
2484	cores = kmalloc_array(num_engine_cores, sizeof(u32), GFP_KERNEL);
2485	if (!cores)
2486		return -ENOMEM;
2487
2488	if (copy_from_user(cores, engine_cores_arr, num_engine_cores * sizeof(u32))) {
2489		dev_err(hdev->dev, "Failed to copy core-ids array from user\n");
2490		kfree(cores);
2491		return -EFAULT;
2492	}
2493
2494	rc = hdev->asic_funcs->set_engine_cores(hdev, cores, num_engine_cores, core_command);
2495	kfree(cores);
2496
2497	return rc;
2498}
2499
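/*
 * cs_ioctl_engines - copy the engine id array from user space and apply the
 * requested engine command on those engines.
 */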
2500static int cs_ioctl_engines(struct hl_fpriv *hpriv, u64 engines_arr_user_addr,
2501						u32 num_engines, enum hl_engine_command command)
2502{
2503	struct hl_device *hdev = hpriv->hdev;
2504	u32 *engines, max_num_of_engines;
2505	void __user *engines_arr;
2506	int rc;
2507
2508	if (!hdev->asic_prop.supports_engine_modes)
2509		return -EPERM;
2510
2511	if (command >= HL_ENGINE_COMMAND_MAX) {
2512		dev_err(hdev->dev, "Engine command is invalid\n");
2513		return -EINVAL;
2514	}
2515
2516	max_num_of_engines = hdev->asic_prop.max_num_of_engines;
2517	if (command == HL_ENGINE_CORE_RUN || command == HL_ENGINE_CORE_HALT)
2518		max_num_of_engines = hdev->asic_prop.num_engine_cores;
2519
2520	if (!num_engines || num_engines > max_num_of_engines) {
2521		dev_err(hdev->dev, "Number of engines %d is invalid\n", num_engines);
2522		return -EINVAL;
2523	}
2524
2525	engines_arr = (void __user *) (uintptr_t) engines_arr_user_addr;
2526	engines = kmalloc_array(num_engines, sizeof(u32), GFP_KERNEL);
2527	if (!engines)
2528		return -ENOMEM;
2529
2530	if (copy_from_user(engines, engines_arr, num_engines * sizeof(u32))) {
2531		dev_err(hdev->dev, "Failed to copy engine-ids array from user\n");
2532		kfree(engines);
2533		return -EFAULT;
2534	}
2535
2536	rc = hdev->asic_funcs->set_engines(hdev, engines, num_engines, command);
2537	kfree(engines);
2538
2539	return rc;
2540}
2541
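/*
 * cs_ioctl_flush_pci_hbw_writes - flush PCI high-bandwidth (HBW) writes by
 * reading the dedicated flush register, which forces previously posted HBW
 * writes towards the device.
 */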
2542static int cs_ioctl_flush_pci_hbw_writes(struct hl_fpriv *hpriv)
2543{
2544	struct hl_device *hdev = hpriv->hdev;
2545	struct asic_fixed_properties *prop = &hdev->asic_prop;
2546
2547	if (!prop->hbw_flush_reg) {
2548		dev_dbg(hdev->dev, "HBW flush is not supported\n");
2549		return -EOPNOTSUPP;
2550	}
2551
2552	RREG32(prop->hbw_flush_reg);
2553
2554	return 0;
2555}
2556
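/*
 * hl_cs_ioctl - main entry point of the CS IOCTL.
 *
 * Performs sanity checks, handles the context-switch/restore phase if needed
 * and dispatches the request according to the CS type encoded in the flags.
 * Unless the CS was rejected with -EAGAIN, the output arguments are filled
 * according to the CS type.
 */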
2557int hl_cs_ioctl(struct drm_device *ddev, void *data, struct drm_file *file_priv)
2558{
2559	struct hl_fpriv *hpriv = file_priv->driver_priv;
2560	union hl_cs_args *args = data;
2561	enum hl_cs_type cs_type = 0;
	u64 cs_seq = ULLONG_MAX;
2563	void __user *chunks;
2564	u32 num_chunks, flags, timeout,
2565		signals_count = 0, sob_addr = 0, handle_id = 0;
2566	u16 sob_initial_count = 0;
2567	int rc;
2568
2569	rc = hl_cs_sanity_checks(hpriv, args);
2570	if (rc)
2571		goto out;
2572
2573	rc = hl_cs_ctx_switch(hpriv, args, &cs_seq);
2574	if (rc)
2575		goto out;
2576
2577	cs_type = hl_cs_get_cs_type(args->in.cs_flags &
2578					~HL_CS_FLAGS_FORCE_RESTORE);
2579	chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
2580	num_chunks = args->in.num_chunks_execute;
2581	flags = args->in.cs_flags;
2582
2583	/* In case this is a staged CS, user should supply the CS sequence */
2584	if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
2585			!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
2586		cs_seq = args->in.seq;
2587
2588	timeout = flags & HL_CS_FLAGS_CUSTOM_TIMEOUT
2589			? msecs_to_jiffies(args->in.timeout * 1000)
2590			: hpriv->hdev->timeout_jiffies;
2591
2592	switch (cs_type) {
2593	case CS_TYPE_SIGNAL:
2594	case CS_TYPE_WAIT:
2595	case CS_TYPE_COLLECTIVE_WAIT:
2596		rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
2597					&cs_seq, args->in.cs_flags, timeout,
2598					&sob_addr, &sob_initial_count);
2599		break;
2600	case CS_RESERVE_SIGNALS:
2601		rc = cs_ioctl_reserve_signals(hpriv,
2602					args->in.encaps_signals_q_idx,
2603					args->in.encaps_signals_count,
2604					&handle_id, &sob_addr, &signals_count);
2605		break;
2606	case CS_UNRESERVE_SIGNALS:
2607		rc = cs_ioctl_unreserve_signals(hpriv,
2608					args->in.encaps_sig_handle_id);
2609		break;
2610	case CS_TYPE_ENGINE_CORE:
2611		rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores,
2612				args->in.num_engine_cores, args->in.core_command);
2613		break;
2614	case CS_TYPE_ENGINES:
2615		rc = cs_ioctl_engines(hpriv, args->in.engines,
2616				args->in.num_engines, args->in.engine_command);
2617		break;
2618	case CS_TYPE_FLUSH_PCI_HBW_WRITES:
2619		rc = cs_ioctl_flush_pci_hbw_writes(hpriv);
2620		break;
2621	default:
2622		rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
2623						args->in.cs_flags,
2624						args->in.encaps_sig_handle_id,
2625						timeout, &sob_initial_count);
2626		break;
2627	}
2628out:
2629	if (rc != -EAGAIN) {
2630		memset(args, 0, sizeof(*args));
2631
2632		switch (cs_type) {
2633		case CS_RESERVE_SIGNALS:
2634			args->out.handle_id = handle_id;
2635			args->out.sob_base_addr_offset = sob_addr;
2636			args->out.count = signals_count;
2637			break;
2638		case CS_TYPE_SIGNAL:
2639			args->out.sob_base_addr_offset = sob_addr;
2640			args->out.sob_count_before_submission = sob_initial_count;
2641			args->out.seq = cs_seq;
2642			break;
2643		case CS_TYPE_DEFAULT:
2644			args->out.sob_count_before_submission = sob_initial_count;
2645			args->out.seq = cs_seq;
2646			break;
2647		default:
2648			args->out.seq = cs_seq;
2649			break;
2650		}
2651
2652		args->out.status = rc;
2653	}
2654
2655	return rc;
2656}
2657
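/*
 * hl_wait_for_fence - wait (or just poll, if timeout_us is 0) on a CS fence.
 *
 * If the fence is already gone, the CS outcome store is consulted to report
 * the final result. On return, *status is set to BUSY/COMPLETED/GONE and, if
 * requested, the completion timestamp is returned in nanoseconds.
 */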
2658static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence,
2659				enum hl_cs_wait_status *status, u64 timeout_us, s64 *timestamp)
2660{
2661	struct hl_device *hdev = ctx->hdev;
2662	ktime_t timestamp_kt;
2663	long completion_rc;
2664	int rc = 0, error;
2665
2666	if (IS_ERR(fence)) {
2667		rc = PTR_ERR(fence);
2668		if (rc == -EINVAL)
2669			dev_notice_ratelimited(hdev->dev,
2670				"Can't wait on CS %llu because current CS is at seq %llu\n",
2671				seq, ctx->cs_sequence);
2672		return rc;
2673	}
2674
2675	if (!fence) {
2676		if (!hl_pop_cs_outcome(&ctx->outcome_store, seq, &timestamp_kt, &error)) {
2677			dev_dbg(hdev->dev,
2678				"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
2679				seq, ctx->cs_sequence);
2680			*status = CS_WAIT_STATUS_GONE;
2681			return 0;
2682		}
2683
2684		completion_rc = 1;
2685		goto report_results;
2686	}
2687
2688	if (!timeout_us) {
2689		completion_rc = completion_done(&fence->completion);
2690	} else {
2691		unsigned long timeout;
2692
2693		timeout = (timeout_us == MAX_SCHEDULE_TIMEOUT) ?
2694				timeout_us : usecs_to_jiffies(timeout_us);
2695		completion_rc =
2696			wait_for_completion_interruptible_timeout(
2697				&fence->completion, timeout);
2698	}
2699
2700	error = fence->error;
2701	timestamp_kt = fence->timestamp;
2702
2703report_results:
2704	if (completion_rc > 0) {
2705		*status = CS_WAIT_STATUS_COMPLETED;
2706		if (timestamp)
2707			*timestamp = ktime_to_ns(timestamp_kt);
2708	} else {
2709		*status = CS_WAIT_STATUS_BUSY;
2710	}
2711
2712	if (completion_rc == -ERESTARTSYS)
2713		rc = completion_rc;
2714	else if (error == -ETIMEDOUT || error == -EIO)
2715		rc = error;
2716
2717	return rc;
2718}
2719
2720/*
2721 * hl_cs_poll_fences - iterate CS fences to check for CS completion
2722 *
2723 * @mcs_data: multi-CS internal data
2724 * @mcs_compl: multi-CS completion structure
2725 *
2726 * @return 0 on success, otherwise non 0 error code
2727 *
 * The function iterates over all CS sequences in the list and sets a bit in
 * completion_bitmap for each completed CS.
 * While iterating, the function sets the stream map of each fence in the
 * fence array in the completion QID stream map, to be used by CSs to perform
 * completion to the multi-CS context.
 * This function shall be called after taking the context ref.
2734 */
2735static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_completion *mcs_compl)
2736{
2737	struct hl_fence **fence_ptr = mcs_data->fence_arr;
2738	struct hl_device *hdev = mcs_data->ctx->hdev;
2739	int i, rc, arr_len = mcs_data->arr_len;
2740	u64 *seq_arr = mcs_data->seq_arr;
2741	ktime_t max_ktime, first_cs_time;
2742	enum hl_cs_wait_status status;
2743
2744	memset(fence_ptr, 0, arr_len * sizeof(struct hl_fence *));
2745
2746	/* get all fences under the same lock */
2747	rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len);
2748	if (rc)
2749		return rc;
2750
	/*
	 * Re-initialize the completion here to handle 2 possible cases:
	 * 1. The CS will complete the multi-CS prior to clearing the completion,
	 *    in which case the fence iteration is guaranteed to catch the CS
	 *    completion.
	 * 2. The completion will occur after the re-init of the completion,
	 *    in which case we will wake up immediately in wait_for_completion.
	 */
2758	reinit_completion(&mcs_compl->completion);
2759
	/*
	 * Set to the maximum time to verify that the timestamp is valid: if at
	 * the end this value is maintained, no timestamp was updated.
	 */
2764	max_ktime = ktime_set(KTIME_SEC_MAX, 0);
2765	first_cs_time = max_ktime;
2766
2767	for (i = 0; i < arr_len; i++, fence_ptr++) {
2768		struct hl_fence *fence = *fence_ptr;
2769
		/*
		 * In order to prevent the case where we wait until timeout even though a CS
		 * associated with the multi-CS actually completed, we do things in the below order:
		 * 1. For each fence, set its QID map in the multi-CS completion QID map. This way
		 *    any CS can, potentially, complete the multi-CS for the specific QID (note
		 *    that once the completion is initialized, calling complete* and then waiting
		 *    on the completion will cause it to return at once).
		 * 2. Only after allowing multi-CS completion for the specific QID do we check
		 *    whether the specific CS already completed (and thus the wait-for-completion
		 *    part will be skipped). If the CS has not completed, it is guaranteed that
		 *    the completing CS will wake up the completion.
		 */
2782		if (fence)
2783			mcs_compl->stream_master_qid_map |= fence->stream_master_qid_map;
2784
		/*
		 * The function won't sleep as it is called with timeout 0
		 * (i.e. it polls the fence).
		 */
2789		rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence, &status, 0, NULL);
2790		if (rc) {
2791			dev_err(hdev->dev,
2792				"wait_for_fence error :%d for CS seq %llu\n",
2793								rc, seq_arr[i]);
2794			break;
2795		}
2796
2797		switch (status) {
2798		case CS_WAIT_STATUS_BUSY:
			/* CS did not finish, the QID to wait on is already stored */
2800			break;
2801		case CS_WAIT_STATUS_COMPLETED:
			/*
			 * Use mcs_handling_done to avoid the possibility that mcs_data
			 * returns to the user indicating that the CS completed before it
			 * finished all of its mcs handling, to avoid a race the next time
			 * the user waits for mcs.
			 * Note: when reaching this case the fence is definitely not NULL,
			 *       but the NULL check was added to overcome static analysis.
			 */
2810			if (fence && !fence->mcs_handling_done) {
				/*
				 * In case the multi-CS is completed but the MCS handling is
				 * not done, we "complete" the multi-CS to prevent it from
				 * waiting until time-out, and the "multi-CS handling done"
				 * will have another chance at the next iteration.
				 */
2817				complete_all(&mcs_compl->completion);
2818				break;
2819			}
2820
2821			mcs_data->completion_bitmap |= BIT(i);
2822			/*
2823			 * For all completed CSs we take the earliest timestamp.
2824			 * For this we have to validate that the timestamp is
2825			 * earliest of all timestamps so far.
2826			 */
2827			if (fence && mcs_data->update_ts &&
2828					(ktime_compare(fence->timestamp, first_cs_time) < 0))
2829				first_cs_time = fence->timestamp;
2830			break;
2831		case CS_WAIT_STATUS_GONE:
2832			mcs_data->update_ts = false;
2833			mcs_data->gone_cs = true;
			/*
			 * It is possible to get old sequence numbers from the user,
			 * which relate to already-completed CSs whose fences are
			 * already gone. In this case, the CS is set as completed but
			 * there is no need to consider its QID for mcs completion.
			 */
2840			mcs_data->completion_bitmap |= BIT(i);
2841			break;
2842		default:
2843			dev_err(hdev->dev, "Invalid fence status\n");
2844			rc = -EINVAL;
2845			break;
2846		}
2847
2848	}
2849
2850	hl_fences_put(mcs_data->fence_arr, arr_len);
2851
2852	if (mcs_data->update_ts &&
2853			(ktime_compare(first_cs_time, max_ktime) != 0))
2854		mcs_data->timestamp = ktime_to_ns(first_cs_time);
2855
2856	return rc;
2857}
2858
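/* Get the fence of the given CS sequence and wait on it with the given timeout */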
2859static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq,
2860				enum hl_cs_wait_status *status, s64 *timestamp)
2861{
2862	struct hl_fence *fence;
2863	int rc = 0;
2864
2865	if (timestamp)
2866		*timestamp = 0;
2867
2868	hl_ctx_get(ctx);
2869
2870	fence = hl_ctx_get_fence(ctx, seq);
2871
2872	rc = hl_wait_for_fence(ctx, seq, fence, status, timeout_us, timestamp);
2873	hl_fence_put(fence);
2874	hl_ctx_put(ctx);
2875
2876	return rc;
2877}
2878
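/*
 * Convert a 64-bit microseconds value to jiffies. Values that don't fit the
 * 32-bit helper are converted via nanoseconds, saturating on overflow.
 */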
2879static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs)
2880{
2881	if (usecs <= U32_MAX)
2882		return usecs_to_jiffies(usecs);
2883
	/*
	 * If the value in nanoseconds would overflow 64 bits, use the largest
	 * 64-bit value.
	 */
2888	if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC)))
2889		return nsecs_to_jiffies(U64_MAX);
2890
2891	return nsecs_to_jiffies(usecs * NSEC_PER_USEC);
2892}
2893
/*
 * hl_wait_multi_cs_completion_init - init completion structure
 *
 * @hdev: pointer to habanalabs device structure
 *
 * @return valid completion struct pointer on success, otherwise error pointer
 *
 * Up to MULTI_CS_MAX_USER_CTX calls can be done concurrently to the driver.
 * The function gets the first available completion (by marking it "used")
 * and initializes its values.
 */
2907static struct multi_cs_completion *hl_wait_multi_cs_completion_init(struct hl_device *hdev)
2908{
2909	struct multi_cs_completion *mcs_compl;
2910	int i;
2911
2912	/* find free multi_cs completion structure */
2913	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
2914		mcs_compl = &hdev->multi_cs_completion[i];
2915		spin_lock(&mcs_compl->lock);
2916		if (!mcs_compl->used) {
2917			mcs_compl->used = 1;
2918			mcs_compl->timestamp = 0;
			/*
			 * Init the QID map to 0 to avoid completion by CSs. The actual
			 * QID map of the multi-CS's CSs will be set incrementally at a
			 * later stage.
			 */
2923			mcs_compl->stream_master_qid_map = 0;
2924			spin_unlock(&mcs_compl->lock);
2925			break;
2926		}
2927		spin_unlock(&mcs_compl->lock);
2928	}
2929
2930	if (i == MULTI_CS_MAX_USER_CTX) {
2931		dev_err(hdev->dev, "no available multi-CS completion structure\n");
2932		return ERR_PTR(-ENOMEM);
2933	}
2934	return mcs_compl;
2935}
2936
2937/*
2938 * hl_wait_multi_cs_completion_fini - return completion structure and set as
2939 *                                    unused
2940 *
2941 * @mcs_compl: pointer to the completion structure
2942 */
2943static void hl_wait_multi_cs_completion_fini(
2944					struct multi_cs_completion *mcs_compl)
2945{
2946	/*
2947	 * free completion structure, do it under lock to be in-sync with the
2948	 * thread that signals completion
2949	 */
2950	spin_lock(&mcs_compl->lock);
2951	mcs_compl->used = 0;
2952	spin_unlock(&mcs_compl->lock);
2953}
2954
/*
 * hl_wait_multi_cs_completion - wait for first CS to complete
 *
 * @mcs_data: multi-CS internal data
 * @mcs_compl: multi-CS completion structure
 *
 * @return 0 on success, otherwise non 0 error code
 */
2962static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data,
2963						struct multi_cs_completion *mcs_compl)
2964{
2965	long completion_rc;
2966
2967	completion_rc = wait_for_completion_interruptible_timeout(&mcs_compl->completion,
2968									mcs_data->timeout_jiffies);
2969
2970	/* update timestamp */
2971	if (completion_rc > 0)
2972		mcs_data->timestamp = mcs_compl->timestamp;
2973
2974	if (completion_rc == -ERESTARTSYS)
2975		return completion_rc;
2976
2977	mcs_data->wait_status = completion_rc;
2978
2979	return 0;
2980}
2981
2982/*
2983 * hl_multi_cs_completion_init - init array of multi-CS completion structures
2984 *
2985 * @hdev: pointer to habanalabs device structure
2986 */
2987void hl_multi_cs_completion_init(struct hl_device *hdev)
2988{
2989	struct multi_cs_completion *mcs_cmpl;
2990	int i;
2991
2992	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
2993		mcs_cmpl = &hdev->multi_cs_completion[i];
2994		mcs_cmpl->used = 0;
2995		spin_lock_init(&mcs_cmpl->lock);
2996		init_completion(&mcs_cmpl->completion);
2997	}
2998}
2999
3000/*
3001 * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl
3002 *
3003 * @hpriv: pointer to the private data of the fd
3004 * @data: pointer to multi-CS wait ioctl in/out args
3005 *
3006 */
3007static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
3008{
3009	struct multi_cs_completion *mcs_compl;
3010	struct hl_device *hdev = hpriv->hdev;
3011	struct multi_cs_data mcs_data = {};
3012	union hl_wait_cs_args *args = data;
3013	struct hl_ctx *ctx = hpriv->ctx;
3014	struct hl_fence **fence_arr;
3015	void __user *seq_arr;
3016	u32 size_to_copy;
3017	u64 *cs_seq_arr;
3018	u8 seq_arr_len;
3019	int rc, i;
3020
3021	for (i = 0 ; i < sizeof(args->in.pad) ; i++)
3022		if (args->in.pad[i]) {
3023			dev_dbg(hdev->dev, "Padding bytes must be 0\n");
3024			return -EINVAL;
3025		}
3026
3027	if (!hdev->supports_wait_for_multi_cs) {
3028		dev_err(hdev->dev, "Wait for multi CS is not supported\n");
3029		return -EPERM;
3030	}
3031
3032	seq_arr_len = args->in.seq_arr_len;
3033
3034	if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) {
3035		dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n",
3036				HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len);
3037		return -EINVAL;
3038	}
3039
3040	/* allocate memory for sequence array */
3041	cs_seq_arr =
3042		kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL);
3043	if (!cs_seq_arr)
3044		return -ENOMEM;
3045
3046	/* copy CS sequence array from user */
3047	seq_arr = (void __user *) (uintptr_t) args->in.seq;
3048	size_to_copy = seq_arr_len * sizeof(*cs_seq_arr);
3049	if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) {
3050		dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n");
3051		rc = -EFAULT;
3052		goto free_seq_arr;
3053	}
3054
3055	/* allocate array for the fences */
3056	fence_arr = kmalloc_array(seq_arr_len, sizeof(struct hl_fence *), GFP_KERNEL);
3057	if (!fence_arr) {
3058		rc = -ENOMEM;
3059		goto free_seq_arr;
3060	}
3061
3062	/* initialize the multi-CS internal data */
3063	mcs_data.ctx = ctx;
3064	mcs_data.seq_arr = cs_seq_arr;
3065	mcs_data.fence_arr = fence_arr;
3066	mcs_data.arr_len = seq_arr_len;
3067
3068	hl_ctx_get(ctx);
3069
3070	/* wait (with timeout) for the first CS to be completed */
3071	mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us);
3072	mcs_compl = hl_wait_multi_cs_completion_init(hdev);
3073	if (IS_ERR(mcs_compl)) {
3074		rc = PTR_ERR(mcs_compl);
3075		goto put_ctx;
3076	}
3077
3078	/* poll all CS fences, extract timestamp */
3079	mcs_data.update_ts = true;
3080	rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
3081	/*
3082	 * skip wait for CS completion when one of the below is true:
3083	 * - an error on the poll function
3084	 * - one or more CS in the list completed
3085	 * - the user called ioctl with timeout 0
3086	 */
3087	if (rc || mcs_data.completion_bitmap || !args->in.timeout_us)
3088		goto completion_fini;
3089
3090	while (true) {
3091		rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl);
3092		if (rc || (mcs_data.wait_status == 0))
3093			break;
3094
3095		/*
3096		 * poll fences once again to update the CS map.
3097		 * no timestamp should be updated this time.
3098		 */
3099		mcs_data.update_ts = false;
3100		rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
3101
3102		if (rc || mcs_data.completion_bitmap)
3103			break;
3104
		/*
		 * If hl_wait_multi_cs_completion returned before the timeout (i.e.
		 * it got a completion), it either got completed by a CS in the multi-CS
		 * list (in which case the indication will be a non-empty completion_bitmap)
		 * or by a CS submitted to one of the shared stream masters but not in the
		 * multi-CS list (in which case we should wait again, but modify the timeout
		 * and set the timestamp to zero to let a CS related to the current multi-CS
		 * set a new, relevant, timestamp).
		 */
3114		mcs_data.timeout_jiffies = mcs_data.wait_status;
3115		mcs_compl->timestamp = 0;
3116	}
3117
3118completion_fini:
3119	hl_wait_multi_cs_completion_fini(mcs_compl);
3120
3121put_ctx:
3122	hl_ctx_put(ctx);
3123	kfree(fence_arr);
3124
3125free_seq_arr:
3126	kfree(cs_seq_arr);
3127
3128	if (rc == -ERESTARTSYS) {
3129		dev_err_ratelimited(hdev->dev,
3130				"user process got signal while waiting for Multi-CS\n");
3131		rc = -EINTR;
3132	}
3133
3134	if (rc)
3135		return rc;
3136
3137	/* update output args */
3138	memset(args, 0, sizeof(*args));
3139
3140	if (mcs_data.completion_bitmap) {
3141		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
3142		args->out.cs_completion_map = mcs_data.completion_bitmap;
3143
		/* if the timestamp is not 0, it's valid */
3145		if (mcs_data.timestamp) {
3146			args->out.timestamp_nsec = mcs_data.timestamp;
3147			args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
3148		}
3149
3150		/* update if some CS was gone */
3151		if (!mcs_data.timestamp)
3152			args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
3153	} else {
3154		args->out.status = HL_WAIT_CS_STATUS_BUSY;
3155	}
3156
3157	return 0;
3158}
3159
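/*
 * hl_cs_wait_ioctl - wait for a single CS to complete and report its status
 * and completion timestamp back to user space.
 */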
3160static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
3161{
3162	struct hl_device *hdev = hpriv->hdev;
3163	union hl_wait_cs_args *args = data;
3164	enum hl_cs_wait_status status;
3165	u64 seq = args->in.seq;
3166	s64 timestamp;
3167	int rc;
3168
3169	rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq, &status, &timestamp);
3170
3171	if (rc == -ERESTARTSYS) {
3172		dev_err_ratelimited(hdev->dev,
3173			"user process got signal while waiting for CS handle %llu\n",
3174			seq);
3175		return -EINTR;
3176	}
3177
3178	memset(args, 0, sizeof(*args));
3179
3180	if (rc) {
3181		if (rc == -ETIMEDOUT) {
3182			dev_err_ratelimited(hdev->dev,
3183				"CS %llu has timed-out while user process is waiting for it\n",
3184				seq);
3185			args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
3186		} else if (rc == -EIO) {
3187			dev_err_ratelimited(hdev->dev,
3188				"CS %llu has been aborted while user process is waiting for it\n",
3189				seq);
3190			args->out.status = HL_WAIT_CS_STATUS_ABORTED;
3191		}
3192		return rc;
3193	}
3194
3195	if (timestamp) {
3196		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
3197		args->out.timestamp_nsec = timestamp;
3198	}
3199
3200	switch (status) {
3201	case CS_WAIT_STATUS_GONE:
3202		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
3203		fallthrough;
3204	case CS_WAIT_STATUS_COMPLETED:
3205		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
3206		break;
3207	case CS_WAIT_STATUS_BUSY:
3208	default:
3209		args->out.status = HL_WAIT_CS_STATUS_BUSY;
3210		break;
3211	}
3212
3213	return 0;
3214}
3215
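/* Store the CQ CB, the CQ counter kernel address and the target value in the record */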
3216static inline void set_record_cq_info(struct hl_user_pending_interrupt *record,
3217					struct hl_cb *cq_cb, u32 cq_offset, u32 target_value)
3218{
3219	record->ts_reg_info.cq_cb = cq_cb;
3220	record->cq_kernel_addr = (u64 *) cq_cb->kernel_address + cq_offset;
3221	record->cq_target_value = target_value;
3222}
3223
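/*
 * Translate ts_offset into a record pointer within the timestamp buffer and
 * verify it does not point past the end of the buffer.
 */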
3224static int validate_and_get_ts_record(struct device *dev,
3225					struct hl_ts_buff *ts_buff, u64 ts_offset,
3226					struct hl_user_pending_interrupt **req_event_record)
3227{
3228	struct hl_user_pending_interrupt *ts_cb_last;
3229
3230	*req_event_record = (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
3231						ts_offset;
3232	ts_cb_last = (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
3233			(ts_buff->kernel_buff_size / sizeof(struct hl_user_pending_interrupt));
3234
	/* Validate that ts_offset does not point past the end of the buffer */
3236	if (*req_event_record >= ts_cb_last) {
3237		dev_err(dev, "Ts offset(%llu) exceeds max CB offset(0x%llx)\n",
3238				ts_offset, (u64)(uintptr_t)ts_cb_last);
3239		return -EINVAL;
3240	}
3241
3242	return 0;
3243}
3244
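/*
 * Remove a timestamp record from its interrupt ts list (if it is still in
 * use) and drop the buffer and CQ CB references taken at registration time.
 * The caller indicates via need_lock whether the interrupt's ts_list_lock
 * must be taken here.
 */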
3245static void unregister_timestamp_node(struct hl_device *hdev,
3246			struct hl_user_pending_interrupt *record, bool need_lock)
3247{
3248	struct hl_user_interrupt *interrupt = record->ts_reg_info.interrupt;
3249	bool ts_rec_found = false;
3250	unsigned long flags;
3251
3252	if (need_lock)
3253		spin_lock_irqsave(&interrupt->ts_list_lock, flags);
3254
3255	if (record->ts_reg_info.in_use) {
3256		record->ts_reg_info.in_use = false;
3257		list_del(&record->list_node);
3258		ts_rec_found = true;
3259	}
3260
3261	if (need_lock)
3262		spin_unlock_irqrestore(&interrupt->ts_list_lock, flags);
3263
3264	/* Put refcounts that were taken when we registered the event */
3265	if (ts_rec_found) {
3266		hl_mmap_mem_buf_put(record->ts_reg_info.buf);
3267		hl_cb_put(record->ts_reg_info.cq_cb);
3268	}
3269}
3270
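/*
 * Get the timestamp record for the requested offset and fill its registration
 * info. If the record is already registered on an interrupt, it is
 * unregistered first and then re-used.
 * Called with the new interrupt's ts_list_lock held.
 */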
3271static int ts_get_and_handle_kernel_record(struct hl_device *hdev, struct hl_ctx *ctx,
3272					struct wait_interrupt_data *data, unsigned long *flags,
3273					struct hl_user_pending_interrupt **pend)
3274{
3275	struct hl_user_pending_interrupt *req_offset_record;
3276	struct hl_ts_buff *ts_buff = data->buf->private;
3277	bool need_lock = false;
3278	int rc;
3279
3280	rc = validate_and_get_ts_record(data->buf->mmg->dev, ts_buff, data->ts_offset,
3281									&req_offset_record);
3282	if (rc)
3283		return rc;
3284
	/* In case the node is already registered, we need to unregister it first and then re-use it */
3286	if (req_offset_record->ts_reg_info.in_use) {
3287		dev_dbg(data->buf->mmg->dev,
3288				"Requested record %p is in use on irq: %u ts addr: %p, unregister first then put on irq: %u\n",
3289				req_offset_record,
3290				req_offset_record->ts_reg_info.interrupt->interrupt_id,
3291				req_offset_record->ts_reg_info.timestamp_kernel_addr,
3292				data->interrupt->interrupt_id);
		/*
		 * Since the interrupt here can be different from the one the node is currently
		 * registered on, and we don't want to lock two lists while doing the unregister,
		 * unlock the new interrupt's wait list here and acquire the lock again after
		 * we're done.
		 */
3298		if (data->interrupt->interrupt_id !=
3299				req_offset_record->ts_reg_info.interrupt->interrupt_id) {
3300
3301			need_lock = true;
3302			spin_unlock_irqrestore(&data->interrupt->ts_list_lock, *flags);
3303		}
3304
3305		unregister_timestamp_node(hdev, req_offset_record, need_lock);
3306
3307		if (need_lock)
3308			spin_lock_irqsave(&data->interrupt->ts_list_lock, *flags);
3309	}
3310
3311	/* Fill up the new registration node info and add it to the list */
3312	req_offset_record->ts_reg_info.in_use = true;
3313	req_offset_record->ts_reg_info.buf = data->buf;
3314	req_offset_record->ts_reg_info.timestamp_kernel_addr =
3315			(u64 *) ts_buff->user_buff_address + data->ts_offset;
3316	req_offset_record->ts_reg_info.interrupt = data->interrupt;
3317	set_record_cq_info(req_offset_record, data->cq_cb, data->cq_offset,
3318						data->target_value);
3319
3320	*pend = req_offset_record;
3321
3322	return rc;
3323}
3324
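/*
 * Register a timestamp record on a user interrupt: once the CQ counter
 * reaches the target value, the interrupt handler writes the timestamp to
 * the user buffer. If the target value has already been reached, the
 * timestamp is written here and the record is released immediately.
 */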
3325static int _hl_interrupt_ts_reg_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
3326				struct wait_interrupt_data *data,
3327				u32 *status, u64 *timestamp)
3328{
3329	struct hl_user_pending_interrupt *pend;
3330	unsigned long flags;
3331	int rc = 0;
3332
3333	hl_ctx_get(ctx);
3334
3335	data->cq_cb = hl_cb_get(data->mmg, data->cq_handle);
3336	if (!data->cq_cb) {
3337		rc = -EINVAL;
3338		goto put_ctx;
3339	}
3340
3341	/* Validate the cq offset */
3342	if (((u64 *) data->cq_cb->kernel_address + data->cq_offset) >=
3343			((u64 *) data->cq_cb->kernel_address + (data->cq_cb->size / sizeof(u64)))) {
3344		rc = -EINVAL;
3345		goto put_cq_cb;
3346	}
3347
3348	dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, handle: 0x%llx, ts offset: %llu, cq_offset: %llu\n",
3349					data->interrupt->interrupt_id, data->ts_handle,
3350					data->ts_offset, data->cq_offset);
3351
3352	data->buf = hl_mmap_mem_buf_get(data->mmg, data->ts_handle);
3353	if (!data->buf) {
3354		rc = -EINVAL;
3355		goto put_cq_cb;
3356	}
3357
3358	spin_lock_irqsave(&data->interrupt->ts_list_lock, flags);
3359
3360	/* get ts buffer record */
3361	rc = ts_get_and_handle_kernel_record(hdev, ctx, data, &flags, &pend);
3362	if (rc) {
3363		spin_unlock_irqrestore(&data->interrupt->ts_list_lock, flags);
3364		goto put_ts_buff;
3365	}
3366
	/* We check the completion value as the interrupt could have been
	 * received before we added the timestamp node to the ts list.
	 */
3370	if (*pend->cq_kernel_addr >= data->target_value) {
3371		spin_unlock_irqrestore(&data->interrupt->ts_list_lock, flags);
3372
3373		dev_dbg(hdev->dev, "Target value already reached release ts record: pend: %p, offset: %llu, interrupt: %u\n",
3374				pend, data->ts_offset, data->interrupt->interrupt_id);
3375
		pend->ts_reg_info.in_use = false;
3377		*status = HL_WAIT_CS_STATUS_COMPLETED;
3378		*pend->ts_reg_info.timestamp_kernel_addr = ktime_get_ns();
3379
3380		goto put_ts_buff;
3381	}
3382
3383	list_add_tail(&pend->list_node, &data->interrupt->ts_list_head);
3384	spin_unlock_irqrestore(&data->interrupt->ts_list_lock, flags);
3385
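	/*
	 * Registration succeeded. The timestamp itself will be written by the
	 * interrupt handler once the CQ counter reaches the target value.
	 */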
3386	rc = *status = HL_WAIT_CS_STATUS_COMPLETED;
3387
3388	hl_ctx_put(ctx);
3389
3390	return rc;
3391
3392put_ts_buff:
3393	hl_mmap_mem_buf_put(data->buf);
3394put_cq_cb:
3395	hl_cb_put(data->cq_cb);
3396put_ctx:
3397	hl_ctx_put(ctx);
3398
3399	return rc;
3400}
3401
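/*
 * Wait on a user interrupt until the CQ counter reaches the target value,
 * the timeout expires, or the wait is interrupted/aborted.
 */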
3402static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
3403				struct wait_interrupt_data *data,
3404				u32 *status, u64 *timestamp)
3405{
3406	struct hl_user_pending_interrupt *pend;
3407	unsigned long timeout, flags;
3408	long completion_rc;
3409	int rc = 0;
3410
3411	timeout = hl_usecs64_to_jiffies(data->intr_timeout_us);
3412
3413	hl_ctx_get(ctx);
3414
3415	data->cq_cb = hl_cb_get(data->mmg, data->cq_handle);
3416	if (!data->cq_cb) {
3417		rc = -EINVAL;
3418		goto put_ctx;
3419	}
3420
3421	/* Validate the cq offset */
3422	if (((u64 *) data->cq_cb->kernel_address + data->cq_offset) >=
3423			((u64 *) data->cq_cb->kernel_address + (data->cq_cb->size / sizeof(u64)))) {
3424		rc = -EINVAL;
3425		goto put_cq_cb;
3426	}
3427
3428	pend = kzalloc(sizeof(*pend), GFP_KERNEL);
3429	if (!pend) {
3430		rc = -ENOMEM;
3431		goto put_cq_cb;
3432	}
3433
3434	hl_fence_init(&pend->fence, ULONG_MAX);
3435	pend->cq_kernel_addr = (u64 *) data->cq_cb->kernel_address + data->cq_offset;
3436	pend->cq_target_value = data->target_value;
	spin_lock_irqsave(&data->interrupt->wait_list_lock, flags);

	/* We check the completion value as the interrupt could have been
	 * received before we added the wait node to the wait list.
	 */
3443	if (*pend->cq_kernel_addr >= data->target_value || (!data->intr_timeout_us)) {
3444		spin_unlock_irqrestore(&data->interrupt->wait_list_lock, flags);
3445
3446		if (*pend->cq_kernel_addr >= data->target_value)
3447			*status = HL_WAIT_CS_STATUS_COMPLETED;
3448		else
3449			*status = HL_WAIT_CS_STATUS_BUSY;
3450
3451		pend->fence.timestamp = ktime_get();
3452		goto set_timestamp;
3453	}
3454
3455	/* Add pending user interrupt to relevant list for the interrupt
3456	 * handler to monitor.
3457	 * Note that we cannot have sorted list by target value,
3458	 * in order to shorten the list pass loop, since
3459	 * same list could have nodes for different cq counter handle.
3460	 */
	list_add_tail(&pend->list_node, &data->interrupt->wait_list_head);
	spin_unlock_irqrestore(&data->interrupt->wait_list_lock, flags);

	/* Wait for interrupt handler to signal completion */
	completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
								timeout);
	if (completion_rc > 0) {
		if (pend->fence.error == -EIO) {
			dev_err_ratelimited(hdev->dev,
					"interrupt based wait ioctl aborted (error: %d) due to an initiated reset cycle\n",
					pend->fence.error);
			rc = -EIO;
			*status = HL_WAIT_CS_STATUS_ABORTED;
		} else {
			*status = HL_WAIT_CS_STATUS_COMPLETED;
		}
	} else {
		if (completion_rc == -ERESTARTSYS) {
			dev_err_ratelimited(hdev->dev,
					"user process got signal while waiting for interrupt ID %d\n",
					data->interrupt->interrupt_id);
			rc = -EINTR;
			*status = HL_WAIT_CS_STATUS_ABORTED;
		} else {
			/* The wait has timed out. We don't know anything beyond that
			 * because the workload was not submitted through the driver.
			 * Therefore, from the driver's perspective, the workload is
			 * still executing.
			 */
			rc = 0;
			*status = HL_WAIT_CS_STATUS_BUSY;
		}
	}

	/*
	 * The node is removed from the list here, rather than in the irq
	 * handler, to cover the completion-timeout case. For a ts record
	 * registration, the node is deleted in the irq handler once the
	 * target value is reached.
	 */
	spin_lock_irqsave(&data->interrupt->wait_list_lock, flags);
	list_del(&pend->list_node);
	spin_unlock_irqrestore(&data->interrupt->wait_list_lock, flags);

set_timestamp:
	*timestamp = ktime_to_ns(pend->fence.timestamp);
	kfree(pend);
	hl_cb_put(data->cq_cb);
	hl_ctx_put(ctx);

	return rc;

put_cq_cb:
	hl_cb_put(data->cq_cb);
put_ctx:
	hl_ctx_put(ctx);

	return rc;
}

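/*
 * _hl_interrupt_wait_ioctl_user_addr() - wait for a user interrupt while
 * polling a 64-bit completion value at an arbitrary user address. The value
 * is re-read from user space every time the interrupt fires, until it reaches
 * the target value or the timeout expires.
 */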
static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_ctx *ctx,
				u64 timeout_us, u64 user_address,
				u64 target_value, struct hl_user_interrupt *interrupt,
				u32 *status,
				u64 *timestamp)
{
	struct hl_user_pending_interrupt *pend;
	unsigned long timeout, flags;
	u64 completion_value;
	long completion_rc;
	int rc = 0;

	timeout = hl_usecs64_to_jiffies(timeout_us);

	hl_ctx_get(ctx);

	pend = kzalloc(sizeof(*pend), GFP_KERNEL);
	if (!pend) {
		hl_ctx_put(ctx);
		return -ENOMEM;
	}

	hl_fence_init(&pend->fence, ULONG_MAX);

	/* Add pending user interrupt to relevant list for the interrupt
	 * handler to monitor
	 */
	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
	list_add_tail(&pend->list_node, &interrupt->wait_list_head);
	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);

	/* Check the completion value here, as the interrupt could have been
	 * received before we added the node to the wait list.
	 */
	if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
		dev_err(hdev->dev, "Failed to copy completion value from user\n");
		rc = -EFAULT;
		goto remove_pending_user_interrupt;
	}

	if (completion_value >= target_value) {
		*status = HL_WAIT_CS_STATUS_COMPLETED;
		/* There was no interrupt; assume the completion happened now. */
		pend->fence.timestamp = ktime_get();
	} else {
		*status = HL_WAIT_CS_STATUS_BUSY;
	}

	if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED))
		goto remove_pending_user_interrupt;

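	/*
	 * Retry loop: if the interrupt fires but the completion value has not
	 * reached the target yet, re-arm the completion and wait again with
	 * the remaining timeout (wait_for_completion_interruptible_timeout()
	 * returns the number of jiffies left when it completes).
	 */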
wait_again:
	/* Wait for interrupt handler to signal completion */
	completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
										timeout);

	/* If the timeout did not expire, perform the comparison.
	 * If the comparison fails, keep waiting until the timeout expires.
	 */
	if (completion_rc > 0) {
		spin_lock_irqsave(&interrupt->wait_list_lock, flags);
		/* reinit_completion must be called before we check the user
		 * completion value; otherwise, if the interrupt is received
		 * after the comparison and before the next wait_for_completion,
		 * we will reach the timeout and fail.
		 */
		reinit_completion(&pend->fence.completion);
		spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);

		if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
			dev_err(hdev->dev, "Failed to copy completion value from user\n");
			rc = -EFAULT;

			goto remove_pending_user_interrupt;
		}

		if (completion_value >= target_value) {
			*status = HL_WAIT_CS_STATUS_COMPLETED;
		} else if (pend->fence.error) {
			dev_err_ratelimited(hdev->dev,
				"interrupt based wait ioctl aborted (error: %d) due to an initiated reset cycle\n",
				pend->fence.error);
			/* set the command completion status as ABORTED */
			*status = HL_WAIT_CS_STATUS_ABORTED;
		} else {
			timeout = completion_rc;
			goto wait_again;
		}
	} else if (completion_rc == -ERESTARTSYS) {
		dev_err_ratelimited(hdev->dev,
			"user process got signal while waiting for interrupt ID %d\n",
			interrupt->interrupt_id);
		rc = -EINTR;
	} else {
		/* The wait has timed out. We don't know anything beyond that
		 * because the workload was not submitted through the driver.
		 * Therefore, from the driver's perspective, the workload is
		 * still executing.
		 */
		rc = 0;
		*status = HL_WAIT_CS_STATUS_BUSY;
	}

remove_pending_user_interrupt:
	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
	list_del(&pend->list_node);
	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);

	*timestamp = ktime_to_ns(pend->fence.timestamp);

	kfree(pend);
	hl_ctx_put(ctx);

	return rc;
}

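/*
 * hl_interrupt_wait_ioctl() - top-level handler for interrupt-based waits.
 * Resolves the target interrupt object from the ID encoded in the flags and
 * dispatches to the CQ-counter wait/registration flows or to the user-address
 * polling flow.
 */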
static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
{
	u16 interrupt_id, first_interrupt, last_interrupt;
	struct hl_device *hdev = hpriv->hdev;
	struct asic_fixed_properties *prop;
	struct hl_user_interrupt *interrupt;
	union hl_wait_cs_args *args = data;
	u32 status = HL_WAIT_CS_STATUS_BUSY;
	u64 timestamp = 0;
	int rc, int_idx;

	prop = &hdev->asic_prop;

	if (!(prop->user_interrupt_count + prop->user_dec_intr_count)) {
		dev_err(hdev->dev, "no user interrupts allowed");
		return -EPERM;
	}

	interrupt_id = FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags);

	first_interrupt = prop->first_available_user_interrupt;
	last_interrupt = prop->first_available_user_interrupt + prop->user_interrupt_count - 1;

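	/*
	 * Interrupt ID ranges: IDs below user_dec_intr_count map to decoder
	 * interrupts, IDs in [first_interrupt, last_interrupt] map to user CQ
	 * interrupts, and two dedicated IDs select the common CQ and common
	 * decoder interrupts.
	 */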
	if (interrupt_id < prop->user_dec_intr_count) {

		/* Check if the requested core is enabled */
		if (!(prop->decoder_enabled_mask & BIT(interrupt_id))) {
			dev_err(hdev->dev, "interrupt on a disabled core (%u) is not allowed",
				interrupt_id);
			return -EINVAL;
		}

		interrupt = &hdev->user_interrupt[interrupt_id];

	} else if (interrupt_id >= first_interrupt && interrupt_id <= last_interrupt) {

		int_idx = interrupt_id - first_interrupt + prop->user_dec_intr_count;
		interrupt = &hdev->user_interrupt[int_idx];

	} else if (interrupt_id == HL_COMMON_USER_CQ_INTERRUPT_ID) {
		interrupt = &hdev->common_user_cq_interrupt;
	} else if (interrupt_id == HL_COMMON_DEC_INTERRUPT_ID) {
		interrupt = &hdev->common_decoder_interrupt;
	} else {
		dev_err(hdev->dev, "invalid user interrupt %u", interrupt_id);
		return -EINVAL;
	}

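	/*
	 * Either wait on a kernel-managed CQ counter buffer (with optional
	 * timestamp registration) or poll a raw user address.
	 */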
	if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ) {
		struct wait_interrupt_data wait_intr_data = {0};

		wait_intr_data.interrupt = interrupt;
		wait_intr_data.mmg = &hpriv->mem_mgr;
		wait_intr_data.cq_handle = args->in.cq_counters_handle;
		wait_intr_data.cq_offset = args->in.cq_counters_offset;
		wait_intr_data.ts_handle = args->in.timestamp_handle;
		wait_intr_data.ts_offset = args->in.timestamp_offset;
		wait_intr_data.target_value = args->in.target;
		wait_intr_data.intr_timeout_us = args->in.interrupt_timeout_us;

		if (args->in.flags & HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT) {
			/*
			 * Allow only one registration at a time. This is needed to prevent
			 * issues when handling re-use of the same offset: the registration
			 * flow is protected only by the interrupt lock, so if the re-use
			 * flow requests to move a ts node to another interrupt list, we are
			 * not protected.
			 */
			mutex_lock(&hpriv->ctx->ts_reg_lock);

			rc = _hl_interrupt_ts_reg_ioctl(hdev, hpriv->ctx, &wait_intr_data,
						&status, &timestamp);

			mutex_unlock(&hpriv->ctx->ts_reg_lock);
		} else
			rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &wait_intr_data,
						&status, &timestamp);
	} else {
		rc = _hl_interrupt_wait_ioctl_user_addr(hdev, hpriv->ctx,
				args->in.interrupt_timeout_us, args->in.addr,
				args->in.target, interrupt, &status,
				&timestamp);
	}

	if (rc)
		return rc;

	memset(args, 0, sizeof(*args));
	args->out.status = status;

	if (timestamp) {
		args->out.timestamp_nsec = timestamp;
		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
	}

	return 0;
}

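/*
 * hl_wait_ioctl() - ioctl entry point for all wait operations. Rejects the
 * request while the device is not operational or a reset watchdog is active,
 * then dispatches to the interrupt, multi-CS or single-CS wait handlers
 * according to the flags.
 */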
int hl_wait_ioctl(struct drm_device *ddev, void *data, struct drm_file *file_priv)
{
	struct hl_fpriv *hpriv = file_priv->driver_priv;
	struct hl_device *hdev = hpriv->hdev;
	union hl_wait_cs_args *args = data;
	u32 flags = args->in.flags;
	int rc;

	/* If the device is not operational, or if an error has happened and the user should
	 * release the device, there is no point in waiting for any command submission or
	 * user interrupt.
	 */
	if (!hl_device_operational(hpriv->hdev, NULL) || hdev->reset_info.watchdog_active)
		return -EBUSY;

	if (flags & HL_WAIT_CS_FLAGS_INTERRUPT)
		rc = hl_interrupt_wait_ioctl(hpriv, data);
	else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS)
		rc = hl_multi_cs_wait_ioctl(hpriv, data);
	else
		rc = hl_cs_wait_ioctl(hpriv, data);

	return rc;
}
