1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4 * Copyright 2016-2021 HabanaLabs, Ltd.
5 * All Rights Reserved.
6 *
7 */
8
9 #define pr_fmt(fmt) "habanalabs: " fmt
10
11 #include "habanalabs.h"
12 #include "../include/hw_ip/pci/pci_general.h"
13
14 #include <linux/pci.h>
15 #include <linux/module.h>
16 #include <linux/vmalloc.h>
17 #include <linux/version.h>
18
19 #include <drm/drm_accel.h>
20 #include <drm/drm_drv.h>
21 #include <drm/drm_ioctl.h>
22
23 #define CREATE_TRACE_POINTS
24 #include <trace/events/habanalabs.h>
25
26 #define HL_DRIVER_AUTHOR "HabanaLabs Kernel Driver Team"
27
28 #define HL_DRIVER_DESC "Driver for HabanaLabs's AI Accelerators"
29
30 MODULE_AUTHOR(HL_DRIVER_AUTHOR);
31 MODULE_DESCRIPTION(HL_DRIVER_DESC);
32 MODULE_LICENSE("GPL v2");
33
/* Char device major number, filled in by hl_init() */
static int hl_major;

/* ID -> struct hl_device lookup; every access takes hl_devs_idr_lock */
static DEFINE_MUTEX(hl_devs_idr_lock);
static DEFINE_IDR(hl_devs_idr);
37
38 #define HL_DEFAULT_TIMEOUT_LOCKED 30 /* 30 seconds */
39 #define GAUDI_DEFAULT_TIMEOUT_LOCKED 600 /* 10 minutes */
40
41 static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
42 static int reset_on_lockup = 1;
43 static int memory_scrub;
44 static ulong boot_error_status_mask = ULONG_MAX;
45
46 module_param(timeout_locked, int, 0444);
47 MODULE_PARM_DESC(timeout_locked,
48 "Device lockup timeout in seconds (0 = disabled, default 30s)");
49
50 module_param(reset_on_lockup, int, 0444);
51 MODULE_PARM_DESC(reset_on_lockup,
52 "Do device reset on lockup (0 = no, 1 = yes, default yes)");
53
54 module_param(memory_scrub, int, 0444);
55 MODULE_PARM_DESC(memory_scrub,
56 "Scrub device memory in various states (0 = no, 1 = yes, default no)");
57
58 module_param(boot_error_status_mask, ulong, 0444);
59 MODULE_PARM_DESC(boot_error_status_mask,
60 "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
61
62 #define PCI_IDS_GOYA 0x0001
63 #define PCI_IDS_GAUDI 0x1000
64 #define PCI_IDS_GAUDI_SEC 0x1010
65
66 #define PCI_IDS_GAUDI2 0x1020
67
68 static const struct pci_device_id ids[] = {
69 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
70 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
71 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI_SEC), },
72 { PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI2), },
73 { 0, }
74 };
75 MODULE_DEVICE_TABLE(pci, ids);
76
77 static const struct drm_ioctl_desc hl_drm_ioctls[] = {
78 DRM_IOCTL_DEF_DRV(HL_INFO, hl_info_ioctl, 0),
79 DRM_IOCTL_DEF_DRV(HL_CB, hl_cb_ioctl, 0),
80 DRM_IOCTL_DEF_DRV(HL_CS, hl_cs_ioctl, 0),
81 DRM_IOCTL_DEF_DRV(HL_WAIT_CS, hl_wait_ioctl, 0),
82 DRM_IOCTL_DEF_DRV(HL_MEMORY, hl_mem_ioctl, 0),
83 DRM_IOCTL_DEF_DRV(HL_DEBUG, hl_debug_ioctl, 0),
84 };
85
86 static const struct file_operations hl_fops = {
87 .owner = THIS_MODULE,
88 .open = accel_open,
89 .release = drm_release,
90 .unlocked_ioctl = drm_ioctl,
91 .compat_ioctl = drm_compat_ioctl,
92 .llseek = noop_llseek,
93 .mmap = hl_mmap
94 };
95
96 static const struct drm_driver hl_driver = {
97 .driver_features = DRIVER_COMPUTE_ACCEL,
98
99 .name = HL_NAME,
100 .desc = HL_DRIVER_DESC,
101 .major = LINUX_VERSION_MAJOR,
102 .minor = LINUX_VERSION_PATCHLEVEL,
103 .patchlevel = LINUX_VERSION_SUBLEVEL,
104 .date = "20190505",
105
106 .fops = &hl_fops,
107 .open = hl_device_open,
108 .postclose = hl_device_release,
109 .ioctls = hl_drm_ioctls,
110 .num_ioctls = ARRAY_SIZE(hl_drm_ioctls)
111 };
112
113 /*
114 * get_asic_type - translate device id to asic type
115 *
116 * @hdev: pointer to habanalabs device structure.
117 *
118 * Translate device id and revision id to asic type.
119 * In case of unidentified device, return -1
120 */
get_asic_type(struct hl_device * hdev)121 static enum hl_asic_type get_asic_type(struct hl_device *hdev)
122 {
123 struct pci_dev *pdev = hdev->pdev;
124 enum hl_asic_type asic_type = ASIC_INVALID;
125
126 switch (pdev->device) {
127 case PCI_IDS_GOYA:
128 asic_type = ASIC_GOYA;
129 break;
130 case PCI_IDS_GAUDI:
131 asic_type = ASIC_GAUDI;
132 break;
133 case PCI_IDS_GAUDI_SEC:
134 asic_type = ASIC_GAUDI_SEC;
135 break;
136 case PCI_IDS_GAUDI2:
137 switch (pdev->revision) {
138 case REV_ID_A:
139 asic_type = ASIC_GAUDI2;
140 break;
141 case REV_ID_B:
142 asic_type = ASIC_GAUDI2B;
143 break;
144 case REV_ID_C:
145 asic_type = ASIC_GAUDI2C;
146 break;
147 case REV_ID_D:
148 asic_type = ASIC_GAUDI2D;
149 break;
150 default:
151 break;
152 }
153 break;
154 default:
155 break;
156 }
157
158 return asic_type;
159 }
160
is_asic_secured(enum hl_asic_type asic_type)161 static bool is_asic_secured(enum hl_asic_type asic_type)
162 {
163 switch (asic_type) {
164 case ASIC_GAUDI_SEC:
165 return true;
166 default:
167 return false;
168 }
169 }
170
171 /*
172 * hl_device_open() - open function for habanalabs device.
173 * @ddev: pointer to DRM device structure.
174 * @file: pointer to DRM file private data structure.
175 *
176 * Called when process opens an habanalabs device.
177 */
hl_device_open(struct drm_device * ddev,struct drm_file * file_priv)178 int hl_device_open(struct drm_device *ddev, struct drm_file *file_priv)
179 {
180 struct hl_device *hdev = to_hl_device(ddev);
181 enum hl_device_status status;
182 struct hl_fpriv *hpriv;
183 int rc;
184
185 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
186 if (!hpriv)
187 return -ENOMEM;
188
189 hpriv->hdev = hdev;
190 mutex_init(&hpriv->notifier_event.lock);
191 mutex_init(&hpriv->restore_phase_mutex);
192 mutex_init(&hpriv->ctx_lock);
193 kref_init(&hpriv->refcount);
194
195 hl_ctx_mgr_init(&hpriv->ctx_mgr);
196 hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
197
198 hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
199
200 mutex_lock(&hdev->fpriv_list_lock);
201
202 if (!hl_device_operational(hdev, &status)) {
203 dev_dbg_ratelimited(hdev->dev,
204 "Can't open %s because it is %s\n",
205 dev_name(hdev->dev), hdev->status[status]);
206
207 if (status == HL_DEVICE_STATUS_IN_RESET ||
208 status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
209 rc = -EAGAIN;
210 else
211 rc = -EPERM;
212
213 goto out_err;
214 }
215
216 if (hdev->is_in_dram_scrub) {
217 dev_dbg_ratelimited(hdev->dev,
218 "Can't open %s during dram scrub\n",
219 dev_name(hdev->dev));
220 rc = -EAGAIN;
221 goto out_err;
222 }
223
224 if (hdev->compute_ctx_in_release) {
225 dev_dbg_ratelimited(hdev->dev,
226 "Can't open %s because another user is still releasing it\n",
227 dev_name(hdev->dev));
228 rc = -EAGAIN;
229 goto out_err;
230 }
231
232 if (hdev->is_compute_ctx_active) {
233 dev_dbg_ratelimited(hdev->dev,
234 "Can't open %s because another user is working on it\n",
235 dev_name(hdev->dev));
236 rc = -EBUSY;
237 goto out_err;
238 }
239
240 rc = hl_ctx_create(hdev, hpriv);
241 if (rc) {
242 dev_err(hdev->dev, "Failed to create context %d\n", rc);
243 goto out_err;
244 }
245
246 list_add(&hpriv->dev_node, &hdev->fpriv_list);
247 mutex_unlock(&hdev->fpriv_list_lock);
248
249 hdev->asic_funcs->send_device_activity(hdev, true);
250
251 hl_debugfs_add_file(hpriv);
252
253 hl_enable_err_info_capture(&hdev->captured_err_info);
254
255 hdev->open_counter++;
256 hdev->last_successful_open_jif = jiffies;
257 hdev->last_successful_open_ktime = ktime_get();
258
259 file_priv->driver_priv = hpriv;
260 hpriv->file_priv = file_priv;
261
262 return 0;
263
264 out_err:
265 mutex_unlock(&hdev->fpriv_list_lock);
266 hl_mem_mgr_fini(&hpriv->mem_mgr, NULL);
267 hl_mem_mgr_idr_destroy(&hpriv->mem_mgr);
268 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
269 mutex_destroy(&hpriv->ctx_lock);
270 mutex_destroy(&hpriv->restore_phase_mutex);
271 mutex_destroy(&hpriv->notifier_event.lock);
272 put_pid(hpriv->taskpid);
273
274 kfree(hpriv);
275
276 return rc;
277 }
278
hl_device_open_ctrl(struct inode * inode,struct file * filp)279 int hl_device_open_ctrl(struct inode *inode, struct file *filp)
280 {
281 struct hl_device *hdev;
282 struct hl_fpriv *hpriv;
283 int rc;
284
285 mutex_lock(&hl_devs_idr_lock);
286 hdev = idr_find(&hl_devs_idr, iminor(inode));
287 mutex_unlock(&hl_devs_idr_lock);
288
289 if (!hdev) {
290 pr_err("Couldn't find device %d:%d\n",
291 imajor(inode), iminor(inode));
292 return -ENXIO;
293 }
294
295 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
296 if (!hpriv)
297 return -ENOMEM;
298
299 /* Prevent other routines from reading partial hpriv data by
300 * initializing hpriv fields before inserting it to the list
301 */
302 hpriv->hdev = hdev;
303 filp->private_data = hpriv;
304
305 nonseekable_open(inode, filp);
306
307 hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
308
309 mutex_lock(&hdev->fpriv_ctrl_list_lock);
310
311 if (!hl_ctrl_device_operational(hdev, NULL)) {
312 dev_dbg_ratelimited(hdev->dev_ctrl,
313 "Can't open %s because it is disabled\n",
314 dev_name(hdev->dev_ctrl));
315 rc = -EPERM;
316 goto out_err;
317 }
318
319 list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
320 mutex_unlock(&hdev->fpriv_ctrl_list_lock);
321
322 return 0;
323
324 out_err:
325 mutex_unlock(&hdev->fpriv_ctrl_list_lock);
326 filp->private_data = NULL;
327 put_pid(hpriv->taskpid);
328
329 kfree(hpriv);
330
331 return rc;
332 }
333
set_driver_behavior_per_device(struct hl_device * hdev)334 static void set_driver_behavior_per_device(struct hl_device *hdev)
335 {
336 hdev->nic_ports_mask = 0;
337 hdev->fw_components = FW_TYPE_ALL_TYPES;
338 hdev->cpu_queues_enable = 1;
339 hdev->pldm = 0;
340 hdev->hard_reset_on_fw_events = 1;
341 hdev->bmc_enable = 1;
342 hdev->reset_on_preboot_fail = 1;
343 hdev->heartbeat = 1;
344 }
345
copy_kernel_module_params_to_device(struct hl_device * hdev)346 static void copy_kernel_module_params_to_device(struct hl_device *hdev)
347 {
348 hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
349
350 hdev->major = hl_major;
351 hdev->memory_scrub = memory_scrub;
352 hdev->reset_on_lockup = reset_on_lockup;
353 hdev->boot_error_status_mask = boot_error_status_mask;
354 }
355
fixup_device_params_per_asic(struct hl_device * hdev,int timeout)356 static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
357 {
358 switch (hdev->asic_type) {
359 case ASIC_GAUDI:
360 case ASIC_GAUDI_SEC:
361 /* If user didn't request a different timeout than the default one, we have
362 * a different default timeout for Gaudi
363 */
364 if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
365 hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
366 MSEC_PER_SEC);
367
368 hdev->reset_upon_device_release = 0;
369 break;
370
371 case ASIC_GOYA:
372 hdev->reset_upon_device_release = 0;
373 break;
374
375 default:
376 hdev->reset_upon_device_release = 1;
377 break;
378 }
379 }
380
fixup_device_params(struct hl_device * hdev)381 static int fixup_device_params(struct hl_device *hdev)
382 {
383 int tmp_timeout;
384
385 tmp_timeout = timeout_locked;
386
387 hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
388 hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
389
390 if (tmp_timeout)
391 hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
392 else
393 hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
394
395 hdev->stop_on_err = true;
396 hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
397 hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
398
399 /* Enable only after the initialization of the device */
400 hdev->disabled = true;
401
402 if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
403 (hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
404 pr_err("Preboot must be set along with other components");
405 return -EINVAL;
406 }
407
408 /* If CPU queues not enabled, no way to do heartbeat */
409 if (!hdev->cpu_queues_enable)
410 hdev->heartbeat = 0;
411 fixup_device_params_per_asic(hdev, tmp_timeout);
412
413 return 0;
414 }
415
allocate_device_id(struct hl_device * hdev)416 static int allocate_device_id(struct hl_device *hdev)
417 {
418 int id;
419
420 mutex_lock(&hl_devs_idr_lock);
421 id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
422 mutex_unlock(&hl_devs_idr_lock);
423
424 if (id < 0) {
425 if (id == -ENOSPC)
426 pr_err("too many devices in the system\n");
427 return -EBUSY;
428 }
429
430 hdev->id = id;
431
432 /*
433 * Firstly initialized with the internal device ID.
434 * Will be updated later after the DRM device registration to hold the minor ID.
435 */
436 hdev->cdev_idx = hdev->id;
437
438 return 0;
439 }
440
441 /**
442 * create_hdev - create habanalabs device instance
443 *
444 * @dev: will hold the pointer to the new habanalabs device structure
445 * @pdev: pointer to the pci device
446 *
447 * Allocate memory for habanalabs device and initialize basic fields
448 * Identify the ASIC type
449 * Allocate ID (minor) for the device (only for real devices)
450 */
create_hdev(struct hl_device ** dev,struct pci_dev * pdev)451 static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
452 {
453 struct hl_device *hdev;
454 int rc;
455
456 *dev = NULL;
457
458 hdev = devm_drm_dev_alloc(&pdev->dev, &hl_driver, struct hl_device, drm);
459 if (IS_ERR(hdev))
460 return PTR_ERR(hdev);
461
462 hdev->dev = hdev->drm.dev;
463
464 /* Will be NULL in case of simulator device */
465 hdev->pdev = pdev;
466
467 /* Assign status description string */
468 strscpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
469 strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
470 strscpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
471 strscpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
472 strscpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
473 "in device creation", HL_STR_MAX);
474 strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE],
475 "in reset after device release", HL_STR_MAX);
476
477
478 /* First, we must find out which ASIC are we handling. This is needed
479 * to configure the behavior of the driver (kernel parameters)
480 */
481 hdev->asic_type = get_asic_type(hdev);
482 if (hdev->asic_type == ASIC_INVALID) {
483 dev_err(&pdev->dev, "Unsupported ASIC\n");
484 rc = -ENODEV;
485 goto out_err;
486 }
487
488 copy_kernel_module_params_to_device(hdev);
489
490 set_driver_behavior_per_device(hdev);
491
492 fixup_device_params(hdev);
493
494 rc = allocate_device_id(hdev);
495 if (rc)
496 goto out_err;
497
498 *dev = hdev;
499
500 return 0;
501
502 out_err:
503 return rc;
504 }
505
506 /*
507 * destroy_hdev - destroy habanalabs device instance
508 *
509 * @dev: pointer to the habanalabs device structure
510 *
511 */
destroy_hdev(struct hl_device * hdev)512 static void destroy_hdev(struct hl_device *hdev)
513 {
514 /* Remove device from the device list */
515 mutex_lock(&hl_devs_idr_lock);
516 idr_remove(&hl_devs_idr, hdev->id);
517 mutex_unlock(&hl_devs_idr_lock);
518
519 }
520
/*
 * hl_pmops_suspend - power management suspend callback
 *
 * @dev: generic device whose driver data is the habanalabs device
 *
 * Return: the result of hl_device_suspend(), or 0 when no habanalabs device
 * is attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to suspend PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_suspend(hdev);

	/* Logged as an error, but not reported as a failure to the caller */
	pr_err("device pointer is NULL in suspend\n");

	return 0;
}
534
/*
 * hl_pmops_resume - power management resume callback
 *
 * @dev: generic device whose driver data is the habanalabs device
 *
 * Return: the result of hl_device_resume(), or 0 when no habanalabs device
 * is attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to resume PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_resume(hdev);

	/* Logged as an error, but not reported as a failure to the caller */
	pr_err("device pointer is NULL in resume\n");

	return 0;
}
548
549 /**
550 * hl_pci_probe - probe PCI habanalabs devices
551 *
552 * @pdev: pointer to pci device
553 * @id: pointer to pci device id structure
554 *
555 * Standard PCI probe function for habanalabs device.
556 * Create a new habanalabs device and initialize it according to the
557 * device's type
558 */
hl_pci_probe(struct pci_dev * pdev,const struct pci_device_id * id)559 static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
560 {
561 struct hl_device *hdev;
562 int rc;
563
564 dev_info(&pdev->dev, HL_NAME
565 " device found [%04x:%04x] (rev %x)\n",
566 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
567
568 rc = create_hdev(&hdev, pdev);
569 if (rc)
570 return rc;
571
572 pci_set_drvdata(pdev, hdev);
573
574 rc = hl_device_init(hdev);
575 if (rc) {
576 dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
577 rc = -ENODEV;
578 goto disable_device;
579 }
580
581 return 0;
582
583 disable_device:
584 pci_set_drvdata(pdev, NULL);
585 destroy_hdev(hdev);
586
587 return rc;
588 }
589
590 /*
591 * hl_pci_remove - remove PCI habanalabs devices
592 *
593 * @pdev: pointer to pci device
594 *
595 * Standard PCI remove function for habanalabs device
596 */
hl_pci_remove(struct pci_dev * pdev)597 static void hl_pci_remove(struct pci_dev *pdev)
598 {
599 struct hl_device *hdev;
600
601 hdev = pci_get_drvdata(pdev);
602 if (!hdev)
603 return;
604
605 hl_device_fini(hdev);
606 pci_set_drvdata(pdev, NULL);
607 destroy_hdev(hdev);
608 }
609
610 /**
611 * hl_pci_err_detected - a PCI bus error detected on this device
612 *
613 * @pdev: pointer to pci device
614 * @state: PCI error type
615 *
616 * Called by the PCI subsystem whenever a non-correctable
617 * PCI bus error is detected
618 */
619 static pci_ers_result_t
hl_pci_err_detected(struct pci_dev * pdev,pci_channel_state_t state)620 hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
621 {
622 struct hl_device *hdev = pci_get_drvdata(pdev);
623 enum pci_ers_result result;
624
625 switch (state) {
626 case pci_channel_io_normal:
627 dev_warn(hdev->dev, "PCI normal state error detected\n");
628 return PCI_ERS_RESULT_CAN_RECOVER;
629
630 case pci_channel_io_frozen:
631 dev_warn(hdev->dev, "PCI frozen state error detected\n");
632 result = PCI_ERS_RESULT_NEED_RESET;
633 break;
634
635 case pci_channel_io_perm_failure:
636 dev_warn(hdev->dev, "PCI failure state error detected\n");
637 result = PCI_ERS_RESULT_DISCONNECT;
638 break;
639
640 default:
641 result = PCI_ERS_RESULT_NONE;
642 }
643
644 hdev->asic_funcs->halt_engines(hdev, true, false);
645
646 return result;
647 }
648
649 /**
650 * hl_pci_err_resume - resume after a PCI slot reset
651 *
652 * @pdev: pointer to pci device
653 *
654 */
hl_pci_err_resume(struct pci_dev * pdev)655 static void hl_pci_err_resume(struct pci_dev *pdev)
656 {
657 struct hl_device *hdev = pci_get_drvdata(pdev);
658
659 dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
660 hl_device_resume(hdev);
661 }
662
663 /**
664 * hl_pci_err_slot_reset - a PCI slot reset has just happened
665 *
666 * @pdev: pointer to pci device
667 *
668 * Determine if the driver can recover from the PCI slot reset
669 */
hl_pci_err_slot_reset(struct pci_dev * pdev)670 static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
671 {
672 struct hl_device *hdev = pci_get_drvdata(pdev);
673
674 dev_warn(hdev->dev, "PCI slot reset detected\n");
675
676 return PCI_ERS_RESULT_RECOVERED;
677 }
678
hl_pci_reset_prepare(struct pci_dev * pdev)679 static void hl_pci_reset_prepare(struct pci_dev *pdev)
680 {
681 struct hl_device *hdev;
682
683 hdev = pci_get_drvdata(pdev);
684 if (!hdev)
685 return;
686
687 hdev->disabled = true;
688 }
689
hl_pci_reset_done(struct pci_dev * pdev)690 static void hl_pci_reset_done(struct pci_dev *pdev)
691 {
692 struct hl_device *hdev;
693 u32 flags;
694
695 hdev = pci_get_drvdata(pdev);
696 if (!hdev)
697 return;
698
699 /*
700 * Schedule a thread to trigger hard reset.
701 * The reason for this handler, is for rare cases where the driver is up
702 * and FLR occurs. This is valid only when working with no VM, so FW handles FLR
703 * and resets the device. FW will go back preboot stage, so driver needs to perform
704 * hard reset in order to load FW fit again.
705 */
706 flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW;
707
708 hl_device_reset(hdev, flags);
709 }
710
711 static const struct dev_pm_ops hl_pm_ops = {
712 .suspend = hl_pmops_suspend,
713 .resume = hl_pmops_resume,
714 };
715
716 static const struct pci_error_handlers hl_pci_err_handler = {
717 .error_detected = hl_pci_err_detected,
718 .slot_reset = hl_pci_err_slot_reset,
719 .resume = hl_pci_err_resume,
720 .reset_prepare = hl_pci_reset_prepare,
721 .reset_done = hl_pci_reset_done,
722 };
723
724 static struct pci_driver hl_pci_driver = {
725 .name = HL_NAME,
726 .id_table = ids,
727 .probe = hl_pci_probe,
728 .remove = hl_pci_remove,
729 .shutdown = hl_pci_remove,
730 .driver = {
731 .name = HL_NAME,
732 .pm = &hl_pm_ops,
733 .probe_type = PROBE_PREFER_ASYNCHRONOUS,
734 },
735 .err_handler = &hl_pci_err_handler,
736 };
737
738 /*
739 * hl_init - Initialize the habanalabs kernel driver
740 */
hl_init(void)741 static int __init hl_init(void)
742 {
743 int rc;
744 dev_t dev;
745
746 pr_info("loading driver\n");
747
748 rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
749 if (rc < 0) {
750 pr_err("unable to get major\n");
751 return rc;
752 }
753
754 hl_major = MAJOR(dev);
755
756 rc = pci_register_driver(&hl_pci_driver);
757 if (rc) {
758 pr_err("failed to register pci device\n");
759 goto remove_major;
760 }
761
762 pr_debug("driver loaded\n");
763
764 return 0;
765
766 remove_major:
767 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
768 return rc;
769 }
770
771 /*
772 * hl_exit - Release all resources of the habanalabs kernel driver
773 */
hl_exit(void)774 static void __exit hl_exit(void)
775 {
776 pci_unregister_driver(&hl_pci_driver);
777
778 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
779
780 idr_destroy(&hl_devs_idr);
781
782 pr_debug("driver removed\n");
783 }
784
/* Module entry and exit points */
module_init(hl_init);
module_exit(hl_exit);
787