// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2019 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"

#include <linux/slab.h>

/*
 * hl_hw_queue_add_ptr - add to pi or ci and check if it wraps around
 *
 * @ptr: the current pi/ci value
 * @val: the amount to add
 *
 * Add val to ptr. It can go up to twice the queue length.
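 *
 * Tracking pi/ci over twice the queue length (rather than masking them to
 * the queue length itself) keeps "full" and "empty" distinguishable: e.g.
 * for a queue of length 4 the pointers cycle through 0..7, so pi == ci
 * means the queue is empty while (pi - ci) == 4 means it is full.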
 */
inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val)
{
	ptr += val;
	ptr &= ((HL_QUEUE_LENGTH << 1) - 1);
	return ptr;
}

static inline int queue_ci_get(atomic_t *ci, u32 queue_len)
{
	return atomic_read(ci) & ((queue_len << 1) - 1);
}

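/*
 * queue_free_slots - return the number of free slots in a queue
 *
 * Because pi and ci are tracked over twice the queue length, an empty
 * queue yields queue_len free slots and a full queue yields 0, without
 * any extra state needed to tell the two apart.
 */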
static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len)
{
	int delta = (q->pi - queue_ci_get(&q->ci, queue_len));

	if (delta >= 0)
		return (queue_len - delta);
	else
		return (abs(delta) - queue_len);
}

void hl_hw_queue_update_ci(struct hl_cs *cs)
{
	struct hl_device *hdev = cs->ctx->hdev;
	struct hl_hw_queue *q;
	int i;

	if (hdev->disabled)
		return;

	q = &hdev->kernel_queues[0];

	/* There are no internal queues if H/W queues are being used */
	if (!hdev->asic_prop.max_queues || q->queue_type == QUEUE_TYPE_HW)
		return;

	/* We must increment CI for every queue that will never get a
	 * completion. There are two scenarios in which this can happen:
	 * 1. All queues of a non-completion CS will never get a completion.
	 * 2. Internal queues never get a completion.
	 */
	for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) {
		if (!cs_needs_completion(cs) || q->queue_type == QUEUE_TYPE_INT)
			atomic_add(cs->jobs_in_queue_cnt[i], &q->ci);
	}
}

/*
 * hl_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
 *                           H/W queue.
 * @hdev: pointer to habanalabs device structure
 * @q: pointer to habanalabs queue structure
 * @ctl: BD's control word
 * @len: BD's length
 * @ptr: BD's pointer
 *
 * This function assumes there is enough space on the queue to submit a new
 * BD to it. It initializes the next BD and calls the device specific
 * function to set the pi (and doorbell).
 *
 * This function must be called with the scheduler mutex held.
 */
void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
		u32 ctl, u32 len, u64 ptr)
{
	struct hl_bd *bd;
	u64 addr;
	int i;

	bd = q->kernel_address;
	bd += hl_pi_2_offset(q->pi);
	bd->ctl = cpu_to_le32(ctl);
	bd->len = cpu_to_le32(len);
	bd->ptr = cpu_to_le64(ptr);

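	/* If the PQ resides in device DRAM, also write the BD there as two
	 * 64-bit words through the generic device memory access callback.
	 */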
	if (q->dram_bd)
		for (i = 0 ; i < 2 ; i++) {
			addr = q->pq_dram_address +
			((hl_pi_2_offset(q->pi) * sizeof(struct hl_bd)) + (i * sizeof(u64)));
			hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, addr,
						(u64 *)(bd) + i, DEBUGFS_WRITE64);
		}

	q->pi = hl_queue_inc_ptr(q->pi);

	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
}

/*
 * ext_queue_sanity_checks - perform some sanity checks on external queue
 *
 * @hdev: pointer to hl_device structure
 * @q: pointer to hl_hw_queue structure
 * @num_of_entries: how many entries to check for space
 * @reserve_cq_entry: whether to reserve an entry in the cq
 *
 * H/W queues spinlock should be taken before calling this function
 *
 * Perform the following:
 * - Make sure we have enough space in the h/w queue
 * - Make sure we have enough space in the completion queue
 * - Reserve space in the completion queue (needs to be reversed if there
 *   is a failure down the road before the actual submission of work). Only
 *   do this action if reserve_cq_entry is true
 *
 */
static int ext_queue_sanity_checks(struct hl_device *hdev,
				struct hl_hw_queue *q, int num_of_entries,
				bool reserve_cq_entry)
{
	atomic_t *free_slots =
			&hdev->completion_queue[q->cq_id].free_slots_cnt;
	int free_slots_cnt;

	/* Check we have enough space in the queue */
	free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH);

	if (free_slots_cnt < num_of_entries) {
		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
			q->hw_queue_id, num_of_entries);
		return -EAGAIN;
	}

	if (reserve_cq_entry) {
		/*
		 * Check we have enough space in the completion queue.
		 * Subtract num_of_entries from the free slots counter; if the
		 * result is negative, the CQ is full and we can't submit a
		 * new CB because we won't get an ack on its completion.
		 * atomic_add_negative returns true in that case, so undo the
		 * subtraction and bail out.
		 */
		if (atomic_add_negative(num_of_entries * -1, free_slots)) {
			dev_dbg(hdev->dev, "No space for %d on CQ %d\n",
				num_of_entries, q->hw_queue_id);
			atomic_add(num_of_entries, free_slots);
			return -EAGAIN;
		}
	}

	return 0;
}

/*
 * int_queue_sanity_checks - perform some sanity checks on internal queue
 *
 * @hdev: pointer to hl_device structure
 * @q: pointer to hl_hw_queue structure
 * @num_of_entries: how many entries to check for space
 *
 * H/W queues spinlock should be taken before calling this function
 *
 * Perform the following:
 * - Make sure we have enough space in the h/w queue
 *
 */
static int int_queue_sanity_checks(struct hl_device *hdev,
					struct hl_hw_queue *q,
					int num_of_entries)
{
	int free_slots_cnt;

	if (num_of_entries > q->int_queue_len) {
		dev_err(hdev->dev,
			"Cannot populate queue %u with %u jobs\n",
			q->hw_queue_id, num_of_entries);
		return -ENOMEM;
	}

	/* Check we have enough space in the queue */
	free_slots_cnt = queue_free_slots(q, q->int_queue_len);

	if (free_slots_cnt < num_of_entries) {
		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
			q->hw_queue_id, num_of_entries);
		return -EAGAIN;
	}

	return 0;
}

/*
 * hw_queue_sanity_checks() - Make sure we have enough space in the h/w queue
 * @hdev: Pointer to hl_device structure.
 * @q: Pointer to hl_hw_queue structure.
 * @num_of_entries: How many entries to check for space.
 *
 * Notice: We do not reserve queue entries, so this function mustn't be called
 *         more than once per CS for the same queue.
 *
 */
static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
					int num_of_entries)
{
	int free_slots_cnt;

	/* Check we have enough space in the queue */
	free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH);

	if (free_slots_cnt < num_of_entries) {
		dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
			q->hw_queue_id, num_of_entries);
		return -EAGAIN;
	}

	return 0;
}

/*
 * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
 *
 * @hdev: pointer to hl_device structure
 * @hw_queue_id: ID of the queue to send the CB to
 * @cb_size: size of CB
 * @cb_ptr: pointer to CB location
 *
 * This function sends a single CB that must NOT generate a completion entry.
 * Sending CPU messages can be done instead via 'hl_hw_queue_submit_bd()'
 */
int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
				u32 cb_size, u64 cb_ptr)
{
	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
	int rc = 0;

	hdev->asic_funcs->hw_queues_lock(hdev);

	if (hdev->disabled) {
		rc = -EPERM;
		goto out;
	}

	/*
	 * hl_hw_queue_send_cb_no_cmpl() is called for queues of a H/W queue
	 * type only on init phase, when the queues are empty and being tested,
	 * so there is no need for sanity checks.
	 */
	if (q->queue_type != QUEUE_TYPE_HW) {
		rc = ext_queue_sanity_checks(hdev, q, 1, false);
		if (rc)
			goto out;
	}

	hl_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);

out:
	hdev->asic_funcs->hw_queues_unlock(hdev);

	return rc;
}

/*
 * ext_queue_schedule_job - submit a JOB to an external queue
 *
 * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called with the scheduler mutex held.
 *
 */
static void ext_queue_schedule_job(struct hl_cs_job *job)
{
	struct hl_device *hdev = job->cs->ctx->hdev;
	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
	struct hl_cq_entry cq_pkt;
	struct hl_cq *cq;
	u64 cq_addr;
	struct hl_cb *cb;
	u32 ctl;
	u32 len;
	u64 ptr;

	/*
	 * Update the JOB ID inside the BD CTL so the device knows what
	 * to write in the completion queue
	 */
	ctl = ((q->pi << BD_CTL_SHADOW_INDEX_SHIFT) & BD_CTL_SHADOW_INDEX_MASK);
	cb = job->patched_cb;
	len = job->job_cb_size;
	ptr = cb->bus_address;

	/* Skip completion flow in case this is a non completion CS */
	if (!cs_needs_completion(job->cs))
		goto submit_bd;

	cq_pkt.data = cpu_to_le32(
			((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
				& CQ_ENTRY_SHADOW_INDEX_MASK) |
			FIELD_PREP(CQ_ENTRY_SHADOW_INDEX_VALID_MASK, 1) |
			FIELD_PREP(CQ_ENTRY_READY_MASK, 1));

	/*
	 * No need to protect pi_offset because scheduling to the
	 * H/W queues is done under the scheduler mutex
	 *
	 * No need to check if CQ is full because it was already
	 * checked in ext_queue_sanity_checks
	 */
	cq = &hdev->completion_queue[q->cq_id];
	cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);

	hdev->asic_funcs->add_end_of_cb_packets(hdev, cb->kernel_address, len,
						job->user_cb_size,
						cq_addr,
						le32_to_cpu(cq_pkt.data),
						q->msi_vec,
						job->contains_dma_pkt);

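	/* Save the job in the shadow queue so the completion handler can
	 * retrieve it from the shadow index reported back in the CQ entry.
	 */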
	q->shadow_queue[hl_pi_2_offset(q->pi)] = job;

	cq->pi = hl_cq_inc_ptr(cq->pi);

submit_bd:
	hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
}

/*
 * int_queue_schedule_job - submit a JOB to an internal queue
 *
 * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called with the scheduler mutex held.
 *
 */
static void int_queue_schedule_job(struct hl_cs_job *job)
{
	struct hl_device *hdev = job->cs->ctx->hdev;
	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
	struct hl_bd bd;
	__le64 *pi;

	bd.ctl = 0;
	bd.len = cpu_to_le32(job->job_cb_size);

	if (job->is_kernel_allocated_cb)
		/* bus_address is actually an MMU-mapped address
		 * allocated from an internal pool
		 */
		bd.ptr = cpu_to_le64(job->user_cb->bus_address);
	else
		bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb);

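	/* The driver writes the BD directly into the internal queue. Mask the
	 * PI with the real queue length to get the slot offset, since the PI
	 * itself runs over twice the queue length.
	 */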
	pi = q->kernel_address + (q->pi & (q->int_queue_len - 1)) * sizeof(bd);

	q->pi++;
	q->pi &= ((q->int_queue_len << 1) - 1);

	hdev->asic_funcs->pqe_write(hdev, pi, &bd);

	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
}

/*
 * hw_queue_schedule_job - submit a JOB to a H/W queue
 *
 * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called with the scheduler mutex held.
 *
 */
static void hw_queue_schedule_job(struct hl_cs_job *job)
{
	struct hl_device *hdev = job->cs->ctx->hdev;
	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
	u64 ptr;
	u32 offset, ctl, len;

	/*
	 * Upon PQE completion, COMP_DATA is used as the write data to the
	 * completion queue (QMAN HBW message), and COMP_OFFSET is used as the
	 * write address offset in the SM block (QMAN LBW message).
	 * The write address offset is calculated as "COMP_OFFSET << 2".
	 */
	offset = job->cs->sequence & (hdev->asic_prop.max_pending_cs - 1);
	ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
		((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);

	len = job->job_cb_size;

	/*
	 * A patched CB is created only if a user CB was allocated by the
	 * driver and MMU is disabled. If MMU is enabled, the user CB should
	 * be used instead. If the user CB wasn't allocated by the driver,
	 * assume that it holds an address.
	 */
	if (job->patched_cb)
		ptr = job->patched_cb->bus_address;
	else if (job->is_kernel_allocated_cb)
		ptr = job->user_cb->bus_address;
	else
		ptr = (u64) (uintptr_t) job->user_cb;

	hl_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
}

static int init_signal_cs(struct hl_device *hdev,
		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
{
	struct hl_sync_stream_properties *prop;
	struct hl_hw_sob *hw_sob;
	u32 q_idx;
	int rc = 0;

	q_idx = job->hw_queue_id;
	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
	hw_sob = &prop->hw_sob[prop->curr_sob_offset];

	cs_cmpl->hw_sob = hw_sob;
	cs_cmpl->sob_val = prop->next_sob_val;

	dev_dbg(hdev->dev,
		"generate signal CB, sob_id: %d, sob val: %u, q_idx: %d, seq: %llu\n",
		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx,
		cs_cmpl->cs_seq);

	/* we set an EB since we must make sure all operations are done
	 * when sending the signal
	 */
	hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb,
				cs_cmpl->hw_sob->sob_id, 0, true);

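	/* Advance the queue's next SOB value; the handler also switches to
	 * the next reserved SOB when the current one wraps around.
	 */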
	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1,
								false);

	job->cs->sob_addr_offset = hw_sob->sob_addr;
	job->cs->initial_sob_count = prop->next_sob_val - 1;

	return rc;
}

void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev,
			struct hl_cs *cs, struct hl_cs_job *job,
			struct hl_cs_compl *cs_cmpl)
{
	struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl;
	u32 offset = 0;

	cs_cmpl->hw_sob = handle->hw_sob;

	/* Note that encaps_sig_wait_offset was validated earlier in the flow
	 * against offset values that exceed the max reserved signal count.
	 * Always decrement the offset by 1: when the user sets offset 1, for
	 * example, he means to wait only for the first signal, which will be
	 * pre_sob_val; if he sets offset 2 then the value required is
	 * (pre_sob_val + 1), and so on.
	 * If the user sets the wait offset to 0, treat it as a legacy wait CS
	 * and wait for the next signal.
	 */
	if (job->encaps_sig_wait_offset)
		offset = job->encaps_sig_wait_offset - 1;

	cs_cmpl->sob_val = handle->pre_sob_val + offset;
}

static int init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
{
	struct hl_gen_wait_properties wait_prop;
	struct hl_sync_stream_properties *prop;
	struct hl_cs_compl *signal_cs_cmpl;
	u32 q_idx;

	q_idx = job->hw_queue_id;
	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;

	signal_cs_cmpl = container_of(cs->signal_fence,
					struct hl_cs_compl,
					base_fence);

	if (cs->encaps_signals) {
		/* use the encaps signal handle stored earlier in the flow
		 * and set the SOB information from the encaps
		 * signals handle
		 */
		hl_hw_queue_encaps_sig_set_sob_info(hdev, cs, job, cs_cmpl);

		dev_dbg(hdev->dev, "Wait for encaps signals handle, qidx(%u), CS sequence(%llu), sob val: 0x%x, offset: %u\n",
				cs->encaps_sig_hdl->q_idx,
				cs->encaps_sig_hdl->cs_seq,
				cs_cmpl->sob_val,
				job->encaps_sig_wait_offset);
	} else {
		/* Copy the SOB id and value of the signal CS */
		cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
		cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
	}

	/* Check again if the signal CS has already completed.
	 * If so, don't send any wait CS since the hw_sob could already be in
	 * reset. If the signal has not completed, take a refcount on the
	 * hw_sob to prevent the SOB from being reset while the wait CS is
	 * not yet submitted.
	 * Note that this check is protected by two locks, the hw queue lock
	 * and the completion object lock, and the same completion object
	 * lock also protects the hw_sob reset handler function.
	 * The hw queue lock prevents the hw_sob refcount value, which is
	 * changed by the signal/wait flows, from going out of sync.
	 */
	spin_lock(&signal_cs_cmpl->lock);

	if (completion_done(&cs->signal_fence->completion)) {
		spin_unlock(&signal_cs_cmpl->lock);
		return -EINVAL;
	}

	kref_get(&cs_cmpl->hw_sob->kref);

	spin_unlock(&signal_cs_cmpl->lock);

	dev_dbg(hdev->dev,
		"generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d, seq: %llu\n",
		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
		prop->base_mon_id, q_idx, cs->sequence);

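	/* Fill the wait properties so the ASIC-specific code can generate a
	 * wait CB that arms a monitor on the signal SOB until it reaches
	 * sob_val.
	 */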
	wait_prop.data = (void *) job->patched_cb;
	wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
	wait_prop.sob_mask = 0x1;
	wait_prop.sob_val = cs_cmpl->sob_val;
	wait_prop.mon_id = prop->base_mon_id;
	wait_prop.q_idx = q_idx;
	wait_prop.size = 0;

	hdev->asic_funcs->gen_wait_cb(hdev, &wait_prop);

	mb();
	hl_fence_put(cs->signal_fence);
	cs->signal_fence = NULL;

	return 0;
}

/*
 * init_signal_wait_cs - initialize a signal/wait CS
 * @cs: pointer to the signal/wait CS
 *
 * H/W queues spinlock should be taken before calling this function
 */
static int init_signal_wait_cs(struct hl_cs *cs)
{
	struct hl_ctx *ctx = cs->ctx;
	struct hl_device *hdev = ctx->hdev;
	struct hl_cs_job *job;
	struct hl_cs_compl *cs_cmpl =
			container_of(cs->fence, struct hl_cs_compl, base_fence);
	int rc = 0;

	/* There is only one job in a signal/wait CS */
	job = list_first_entry(&cs->job_list, struct hl_cs_job,
				cs_node);

	if (cs->type & CS_TYPE_SIGNAL)
		rc = init_signal_cs(hdev, job, cs_cmpl);
	else if (cs->type & CS_TYPE_WAIT)
		rc = init_wait_cs(hdev, cs, job, cs_cmpl);

	return rc;
}

static int encaps_sig_first_staged_cs_handler
			(struct hl_device *hdev, struct hl_cs *cs)
{
	struct hl_cs_compl *cs_cmpl =
			container_of(cs->fence,
					struct hl_cs_compl, base_fence);
	struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
	struct hl_encaps_signals_mgr *mgr;
	int rc = 0;

	mgr = &cs->ctx->sig_mgr;

	spin_lock(&mgr->lock);
	encaps_sig_hdl = idr_find(&mgr->handles, cs->encaps_sig_hdl_id);
	if (encaps_sig_hdl) {
		/*
		 * Set handler CS sequence,
		 * the CS which contains the encapsulated signals.
		 */
		encaps_sig_hdl->cs_seq = cs->sequence;
		/* store the handle and set encaps signal indication,
		 * to be used later in cs_do_release to put the last
		 * reference to encaps signals handlers.
		 */
		cs_cmpl->encaps_signals = true;
		cs_cmpl->encaps_sig_hdl = encaps_sig_hdl;

		/* set hw_sob pointer in completion object
		 * since it's used in cs_do_release flow to put
		 * refcount to sob
		 */
		cs_cmpl->hw_sob = encaps_sig_hdl->hw_sob;
		cs_cmpl->sob_val = encaps_sig_hdl->pre_sob_val +
						encaps_sig_hdl->count;

		dev_dbg(hdev->dev, "CS seq (%llu) added to encaps signal handler id (%u), count(%u), qidx(%u), sob(%u), val(%u)\n",
				cs->sequence, encaps_sig_hdl->id,
				encaps_sig_hdl->count,
				encaps_sig_hdl->q_idx,
				cs_cmpl->hw_sob->sob_id,
				cs_cmpl->sob_val);

	} else {
		dev_err(hdev->dev, "encaps handle id(%u) wasn't found!\n",
				cs->encaps_sig_hdl_id);
		rc = -EINVAL;
	}

	spin_unlock(&mgr->lock);

	return rc;
}

/*
 * hl_hw_queue_schedule_cs - schedule a command submission
 * @cs: pointer to the CS
 */
int hl_hw_queue_schedule_cs(struct hl_cs *cs)
{
	enum hl_device_status status;
	struct hl_cs_counters_atomic *cntr;
	struct hl_ctx *ctx = cs->ctx;
	struct hl_device *hdev = ctx->hdev;
	struct hl_cs_job *job, *tmp;
	struct hl_hw_queue *q;
	int rc = 0, i, cq_cnt;
	bool first_entry;
	u32 max_queues;

	cntr = &hdev->aggregated_cs_counters;

	hdev->asic_funcs->hw_queues_lock(hdev);

	if (!hl_device_operational(hdev, &status)) {
		atomic64_inc(&cntr->device_in_reset_drop_cnt);
		atomic64_inc(&ctx->cs_counters.device_in_reset_drop_cnt);
		dev_err(hdev->dev,
			"device is %s, CS rejected!\n", hdev->status[status]);
		rc = -EPERM;
		goto out;
	}

	max_queues = hdev->asic_prop.max_queues;

	q = &hdev->kernel_queues[0];
	for (i = 0, cq_cnt = 0 ; i < max_queues ; i++, q++) {
		if (cs->jobs_in_queue_cnt[i]) {
			switch (q->queue_type) {
			case QUEUE_TYPE_EXT:
				rc = ext_queue_sanity_checks(hdev, q,
						cs->jobs_in_queue_cnt[i],
						cs_needs_completion(cs) ?
								true : false);
				break;
			case QUEUE_TYPE_INT:
				rc = int_queue_sanity_checks(hdev, q,
						cs->jobs_in_queue_cnt[i]);
				break;
			case QUEUE_TYPE_HW:
				rc = hw_queue_sanity_checks(hdev, q,
						cs->jobs_in_queue_cnt[i]);
				break;
			default:
				dev_err(hdev->dev, "Queue type %d is invalid\n",
					q->queue_type);
				rc = -EINVAL;
				break;
			}

			if (rc) {
				atomic64_inc(
					&ctx->cs_counters.queue_full_drop_cnt);
				atomic64_inc(&cntr->queue_full_drop_cnt);
				goto unroll_cq_resv;
			}

			if (q->queue_type == QUEUE_TYPE_EXT)
				cq_cnt++;
		}
	}

	if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT)) {
		rc = init_signal_wait_cs(cs);
		if (rc)
			goto unroll_cq_resv;
	} else if (cs->type == CS_TYPE_COLLECTIVE_WAIT) {
		rc = hdev->asic_funcs->collective_wait_init_cs(cs);
		if (rc)
			goto unroll_cq_resv;
	}

	rc = hdev->asic_funcs->pre_schedule_cs(cs);
	if (rc) {
		dev_err(hdev->dev,
			"Failed in pre-submission operations of CS %d.%llu\n",
			ctx->asid, cs->sequence);
		goto unroll_cq_resv;
	}

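	/* Store the CS in the shadow array, indexed by its truncated sequence
	 * number, so later flows can look it up by that sequence.
	 */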
	hdev->shadow_cs_queue[cs->sequence &
				(hdev->asic_prop.max_pending_cs - 1)] = cs;

	if (cs->encaps_signals && cs->staged_first) {
		rc = encaps_sig_first_staged_cs_handler(hdev, cs);
		if (rc)
			goto unroll_cq_resv;
	}

	spin_lock(&hdev->cs_mirror_lock);

	/* Verify staged CS exists and add to the staged list */
	if (cs->staged_cs && !cs->staged_first) {
		struct hl_cs *staged_cs;

		staged_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence);
		if (!staged_cs) {
			dev_err(hdev->dev,
				"Cannot find staged submission sequence %llu",
				cs->staged_sequence);
			rc = -EINVAL;
			goto unlock_cs_mirror;
		}

		if (is_staged_cs_last_exists(hdev, staged_cs)) {
			dev_err(hdev->dev,
				"Staged submission sequence %llu already submitted",
				cs->staged_sequence);
			rc = -EINVAL;
			goto unlock_cs_mirror;
		}

		list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node);

		/* update stream map of the first CS */
		if (hdev->supports_wait_for_multi_cs)
			staged_cs->fence->stream_master_qid_map |=
					cs->fence->stream_master_qid_map;
	}

	list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);

	/* Queue TDR if the CS is the first entry and if timeout is wanted */
	first_entry = list_first_entry(&hdev->cs_mirror_list,
					struct hl_cs, mirror_node) == cs;
	if ((hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) &&
				first_entry && cs_needs_timeout(cs)) {
		cs->tdr_active = true;
		schedule_delayed_work(&cs->work_tdr, cs->timeout_jiffies);
	}

	spin_unlock(&hdev->cs_mirror_lock);

	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
		switch (job->queue_type) {
		case QUEUE_TYPE_EXT:
			ext_queue_schedule_job(job);
			break;
		case QUEUE_TYPE_INT:
			int_queue_schedule_job(job);
			break;
		case QUEUE_TYPE_HW:
			hw_queue_schedule_job(job);
			break;
		default:
			break;
		}

	cs->submitted = true;

	goto out;

unlock_cs_mirror:
	spin_unlock(&hdev->cs_mirror_lock);
unroll_cq_resv:
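	/* Return the CQ entries that were reserved by the external-queue
	 * sanity checks, since no completion will arrive for them.
	 */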
	q = &hdev->kernel_queues[0];
	for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) {
		if ((q->queue_type == QUEUE_TYPE_EXT) &&
						(cs->jobs_in_queue_cnt[i])) {
			atomic_t *free_slots =
				&hdev->completion_queue[i].free_slots_cnt;
			atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
			cq_cnt--;
		}
	}

out:
	hdev->asic_funcs->hw_queues_unlock(hdev);

	return rc;
}

/*
 * hl_hw_queue_inc_ci_kernel - increment ci for kernel's queue
 *
 * @hdev: pointer to hl_device structure
 * @hw_queue_id: ID of the queue whose ci should be incremented
 */
void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
{
	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];

	atomic_inc(&q->ci);
}

static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
					bool is_cpu_queue)
{
	void *p;
	int rc;

	if (is_cpu_queue)
		p = hl_cpu_accessible_dma_pool_alloc(hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address);
	else
		p = hl_asic_dma_alloc_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address,
						GFP_KERNEL | __GFP_ZERO);
	if (!p)
		return -ENOMEM;

	q->kernel_address = p;

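	/* The shadow queue holds one job pointer per queue slot so that a
	 * completion can be matched back to the job that was submitted there.
	 */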
	q->shadow_queue = kmalloc_array(HL_QUEUE_LENGTH, sizeof(struct hl_cs_job *), GFP_KERNEL);
	if (!q->shadow_queue) {
		dev_err(hdev->dev,
			"Failed to allocate shadow queue for H/W queue %d\n",
			q->hw_queue_id);
		rc = -ENOMEM;
		goto free_queue;
	}

	/* Make sure read/write pointers are initialized to start of queue */
	atomic_set(&q->ci, 0);
	q->pi = 0;

	return 0;

free_queue:
	if (is_cpu_queue)
		hl_cpu_accessible_dma_pool_free(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address);
	else
		hl_asic_dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address,
						q->bus_address);

	return rc;
}

static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	void *p;

	p = hdev->asic_funcs->get_int_queue_base(hdev, q->hw_queue_id,
					&q->bus_address, &q->int_queue_len);
	if (!p) {
		dev_err(hdev->dev,
			"Failed to get base address for internal queue %d\n",
			q->hw_queue_id);
		return -EFAULT;
	}

	q->kernel_address = p;
	q->pi = 0;
	atomic_set(&q->ci, 0);

	return 0;
}

static int cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	return ext_and_cpu_queue_init(hdev, q, true);
}

static int ext_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	return ext_and_cpu_queue_init(hdev, q, false);
}

static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
	void *p;

	p = hl_asic_dma_alloc_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, &q->bus_address,
					GFP_KERNEL | __GFP_ZERO);
	if (!p)
		return -ENOMEM;

	q->kernel_address = p;

	/* Make sure read/write pointers are initialized to start of queue */
	atomic_set(&q->ci, 0);
	q->pi = 0;

	return 0;
}

static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx)
{
	struct hl_sync_stream_properties *sync_stream_prop;
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct hl_hw_sob *hw_sob;
	int sob, reserved_mon_idx, queue_idx;

	sync_stream_prop = &hdev->kernel_queues[q_idx].sync_stream_prop;

	/* We use 'collective_mon_idx' as a running index in order to reserve
	 * monitors for collective master/slave queues.
	 * A collective master queue gets 2 reserved monitors and a collective
	 * slave queue gets 1 reserved monitor.
	 */
	if (hdev->kernel_queues[q_idx].collective_mode ==
			HL_COLLECTIVE_MASTER) {
		reserved_mon_idx = hdev->collective_mon_idx;

		/* reserve the first monitor for collective master queue */
		sync_stream_prop->collective_mstr_mon_id[0] =
			prop->collective_first_mon + reserved_mon_idx;

		/* reserve the second monitor for collective master queue */
		sync_stream_prop->collective_mstr_mon_id[1] =
			prop->collective_first_mon + reserved_mon_idx + 1;

		hdev->collective_mon_idx += HL_COLLECTIVE_RSVD_MSTR_MONS;
	} else if (hdev->kernel_queues[q_idx].collective_mode ==
			HL_COLLECTIVE_SLAVE) {
		reserved_mon_idx = hdev->collective_mon_idx++;

		/* reserve a monitor for collective slave queue */
		sync_stream_prop->collective_slave_mon_id =
			prop->collective_first_mon + reserved_mon_idx;
	}

	if (!hdev->kernel_queues[q_idx].supports_sync_stream)
		return;

	queue_idx = hdev->sync_stream_queue_idx++;

	sync_stream_prop->base_sob_id = prop->sync_stream_first_sob +
			(queue_idx * HL_RSVD_SOBS);
	sync_stream_prop->base_mon_id = prop->sync_stream_first_mon +
			(queue_idx * HL_RSVD_MONS);
	sync_stream_prop->next_sob_val = 1;
	sync_stream_prop->curr_sob_offset = 0;

	for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) {
		hw_sob = &sync_stream_prop->hw_sob[sob];
		hw_sob->hdev = hdev;
		hw_sob->sob_id = sync_stream_prop->base_sob_id + sob;
		hw_sob->sob_addr =
			hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);
		hw_sob->q_idx = q_idx;
		kref_init(&hw_sob->kref);
	}
}

static void sync_stream_queue_reset(struct hl_device *hdev, u32 q_idx)
{
	struct hl_sync_stream_properties *prop =
			&hdev->kernel_queues[q_idx].sync_stream_prop;

	/*
	 * In case we got here due to a stuck CS, the refcnt might be bigger
	 * than 1 and therefore we reset it.
	 */
	kref_init(&prop->hw_sob[prop->curr_sob_offset].kref);
	prop->curr_sob_offset = 0;
	prop->next_sob_val = 1;
}

/*
 * queue_init - main initialization function for H/W queue object
 *
 * @hdev: pointer to hl_device device structure
 * @q: pointer to hl_hw_queue queue structure
 * @hw_queue_id: The id of the H/W queue
 *
 * Allocate dma-able memory for the queue and initialize fields
 * Returns 0 on success
 */
static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
			u32 hw_queue_id)
{
	int rc;

	q->hw_queue_id = hw_queue_id;

	switch (q->queue_type) {
	case QUEUE_TYPE_EXT:
		rc = ext_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_INT:
		rc = int_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_CPU:
		rc = cpu_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_HW:
		rc = hw_queue_init(hdev, q);
		break;
	case QUEUE_TYPE_NA:
		q->valid = 0;
		return 0;
	default:
		dev_crit(hdev->dev, "wrong queue type %d during init\n",
			q->queue_type);
		rc = -EINVAL;
		break;
	}

	sync_stream_queue_init(hdev, q->hw_queue_id);

	if (rc)
		return rc;

	q->valid = 1;

	return 0;
}

/*
 * queue_fini - destroy queue
 *
 * @hdev: pointer to hl_device device structure
 * @q: pointer to hl_hw_queue queue structure
 *
 * Free the queue memory
 */
static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
{
	if (!q->valid)
		return;

	/*
	 * If we arrived here, there are no jobs waiting on this queue
	 * so we can safely remove it.
	 * This is because this function can only be called when:
	 * 1. Either a context is deleted, which only can occur if all its
	 *    jobs were finished
	 * 2. A context wasn't able to be created due to failure or timeout,
	 *    which means there are no jobs on the queue yet
	 *
	 * The only exceptions are the queues of the kernel context, but
	 * if they are being destroyed, it means that the entire module is
	 * being removed. If the module is removed, it means there is no open
	 * user context. It also means that if a job was submitted by
	 * the kernel driver (e.g. context creation), the job itself was
	 * released by the kernel driver when a timeout occurred on its
	 * Completion. Thus, we don't need to release it again.
	 */

	if (q->queue_type == QUEUE_TYPE_INT)
		return;

	kfree(q->shadow_queue);

	if (q->queue_type == QUEUE_TYPE_CPU)
		hl_cpu_accessible_dma_pool_free(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address);
	else
		hl_asic_dma_free_coherent(hdev, HL_QUEUE_SIZE_IN_BYTES, q->kernel_address,
						q->bus_address);
}

int hl_hw_queues_create(struct hl_device *hdev)
{
	struct asic_fixed_properties *asic = &hdev->asic_prop;
	struct hl_hw_queue *q;
	int i, rc, q_ready_cnt;

	hdev->kernel_queues = kcalloc(asic->max_queues,
				sizeof(*hdev->kernel_queues), GFP_KERNEL);

	if (!hdev->kernel_queues) {
		dev_err(hdev->dev, "Not enough memory for H/W queues\n");
		return -ENOMEM;
	}

	/* Initialize the H/W queues */
	for (i = 0, q_ready_cnt = 0, q = hdev->kernel_queues;
			i < asic->max_queues ; i++, q_ready_cnt++, q++) {

		q->queue_type = asic->hw_queues_props[i].type;
		q->supports_sync_stream =
				asic->hw_queues_props[i].supports_sync_stream;
		q->collective_mode = asic->hw_queues_props[i].collective_mode;
		q->dram_bd = asic->hw_queues_props[i].dram_bd;

		rc = queue_init(hdev, q, i);
		if (rc) {
			dev_err(hdev->dev,
				"failed to initialize queue %d\n", i);
			goto release_queues;
		}

		/* Set DRAM PQ address for the queue if it should be at DRAM */
		if (q->dram_bd)
			q->pq_dram_address = asic->hw_queues_props[i].q_dram_bd_address;
	}

	return 0;

release_queues:
	for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++)
		queue_fini(hdev, q);

	kfree(hdev->kernel_queues);

	return rc;
}

void hl_hw_queues_destroy(struct hl_device *hdev)
{
	struct hl_hw_queue *q;
	u32 max_queues = hdev->asic_prop.max_queues;
	int i;

	for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++)
		queue_fini(hdev, q);

	kfree(hdev->kernel_queues);
}

void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset)
{
	struct hl_hw_queue *q;
	u32 max_queues = hdev->asic_prop.max_queues;
	int i;

	for (i = 0, q = hdev->kernel_queues ; i < max_queues ; i++, q++) {
		if ((!q->valid) ||
			((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU)))
			continue;
		q->pi = 0;
		atomic_set(&q->ci, 0);

		if (q->supports_sync_stream)
			sync_stream_queue_reset(hdev, q->hw_queue_id);
	}
}