xref: /openbsd-src/sys/dev/pci/drm/scheduler/sched_main.c (revision 1ad61ae0a79a724d2d3ec69e69c8e1d1ff6b53a0)
1 /*
2  * Copyright 2015 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 /**
25  * DOC: Overview
26  *
27  * The GPU scheduler provides entities which allow userspace to push jobs
28  * into software queues which are then scheduled on a hardware run queue.
29  * The software queues have a priority among them. The scheduler selects the entities
30  * from the run queue using a FIFO. The scheduler provides dependency handling
31  * features among jobs. The driver is supposed to provide callback functions for
32  * backend operations to the scheduler like submitting a job to hardware run queue,
33  * returning the dependencies of a job etc.
34  *
35  * The organisation of the scheduler is the following:
36  *
37  * 1. Each hw run queue has one scheduler
38  * 2. Each scheduler has multiple run queues with different priorities
39  *    (e.g., HIGH_HW, HIGH_SW, KERNEL, NORMAL)
40  * 3. Each scheduler run queue has a queue of entities to schedule
41  * 4. Entities themselves maintain a queue of jobs that will be scheduled on
42  *    the hardware.
43  *
44  * The jobs in an entity are always scheduled in the order in which they were pushed.
45  */
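/*
 * A rough driver-side sketch of the lifecycle described above. This is only a
 * hedged illustration: my_ops, my_run_job, ring, ctx, job, owner, in_fence and
 * the numeric parameters are made-up placeholders, and real drivers wrap every
 * step in their own locking and error handling.
 *
 *	static const struct drm_sched_backend_ops my_ops = {
 *		.run_job	= my_run_job,
 *		.timedout_job	= my_timedout_job,
 *		.free_job	= my_free_job,
 *	};
 *
 *	struct drm_gpu_scheduler *sched_list[] = { &ring->sched };
 *
 *	Once per hardware ring:
 *	drm_sched_init(&ring->sched, &my_ops, 32, 3, msecs_to_jiffies(500),
 *		       NULL, NULL, "my-ring", dev);
 *
 *	Once per userspace context:
 *	drm_sched_entity_init(&ctx->entity, DRM_SCHED_PRIORITY_NORMAL,
 *			      sched_list, ARRAY_SIZE(sched_list), NULL);
 *
 *	Per job submission:
 *	drm_sched_job_init(&job->base, &ctx->entity, owner);
 *	drm_sched_job_add_dependency(&job->base, in_fence);
 *	drm_sched_job_arm(&job->base);
 *	drm_sched_entity_push_job(&job->base);
 */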
46 
47 #include <linux/kthread.h>
48 #include <linux/wait.h>
49 #include <linux/sched.h>
50 #include <linux/completion.h>
51 #include <linux/dma-resv.h>
52 #ifdef __linux__
53 #include <uapi/linux/sched/types.h>
54 #endif
55 
56 #include <drm/drm_print.h>
57 #include <drm/drm_gem.h>
58 #include <drm/gpu_scheduler.h>
59 #include <drm/spsc_queue.h>
60 
61 #define CREATE_TRACE_POINTS
62 #include "gpu_scheduler_trace.h"
63 
64 #define to_drm_sched_job(sched_job)		\
65 		container_of((sched_job), struct drm_sched_job, queue_node)
66 
67 /**
68  * drm_sched_rq_init - initialize a given run queue struct
69  *
70  * @sched: scheduler instance to associate with this run queue
71  * @rq: scheduler run queue
72  *
73  * Initializes a scheduler runqueue.
74  */
75 static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
76 			      struct drm_sched_rq *rq)
77 {
78 	mtx_init(&rq->lock, IPL_NONE);
79 	INIT_LIST_HEAD(&rq->entities);
80 	rq->current_entity = NULL;
81 	rq->sched = sched;
82 }
83 
84 /**
85  * drm_sched_rq_add_entity - add an entity
86  *
87  * @rq: scheduler run queue
88  * @entity: scheduler entity
89  *
90  * Adds a scheduler entity to the run queue.
91  */
92 void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
93 			     struct drm_sched_entity *entity)
94 {
95 	if (!list_empty(&entity->list))
96 		return;
97 	spin_lock(&rq->lock);
98 	atomic_inc(rq->sched->score);
99 	list_add_tail(&entity->list, &rq->entities);
100 	spin_unlock(&rq->lock);
101 }
102 
103 /**
104  * drm_sched_rq_remove_entity - remove an entity
105  *
106  * @rq: scheduler run queue
107  * @entity: scheduler entity
108  *
109  * Removes a scheduler entity from the run queue.
110  */
111 void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
112 				struct drm_sched_entity *entity)
113 {
114 	if (list_empty(&entity->list))
115 		return;
116 	spin_lock(&rq->lock);
117 	atomic_dec(rq->sched->score);
118 	list_del_init(&entity->list);
119 	if (rq->current_entity == entity)
120 		rq->current_entity = NULL;
121 	spin_unlock(&rq->lock);
122 }
123 
124 /**
125  * drm_sched_rq_select_entity - Select an entity which could provide a job to run
126  *
127  * @rq: scheduler run queue to check.
128  *
129  * Try to find a ready entity; returns NULL if none is found.
130  */
131 static struct drm_sched_entity *
132 drm_sched_rq_select_entity(struct drm_sched_rq *rq)
133 {
134 	struct drm_sched_entity *entity;
135 
136 	spin_lock(&rq->lock);
137 
138 	entity = rq->current_entity;
139 	if (entity) {
140 		list_for_each_entry_continue(entity, &rq->entities, list) {
141 			if (drm_sched_entity_is_ready(entity)) {
142 				rq->current_entity = entity;
143 				reinit_completion(&entity->entity_idle);
144 				spin_unlock(&rq->lock);
145 				return entity;
146 			}
147 		}
148 	}
149 
150 	list_for_each_entry(entity, &rq->entities, list) {
151 
152 		if (drm_sched_entity_is_ready(entity)) {
153 			rq->current_entity = entity;
154 			reinit_completion(&entity->entity_idle);
155 			spin_unlock(&rq->lock);
156 			return entity;
157 		}
158 
159 		if (entity == rq->current_entity)
160 			break;
161 	}
162 
163 	spin_unlock(&rq->lock);
164 
165 	return NULL;
166 }
167 
168 /**
169  * drm_sched_job_done - complete a job
170  * @s_job: pointer to the job which is done
171  *
172  * Finish the job's fence and wake up the worker thread.
173  */
174 static void drm_sched_job_done(struct drm_sched_job *s_job)
175 {
176 	struct drm_sched_fence *s_fence = s_job->s_fence;
177 	struct drm_gpu_scheduler *sched = s_fence->sched;
178 
179 	atomic_dec(&sched->hw_rq_count);
180 	atomic_dec(sched->score);
181 
182 	trace_drm_sched_process_job(s_fence);
183 
184 	dma_fence_get(&s_fence->finished);
185 	drm_sched_fence_finished(s_fence);
186 	dma_fence_put(&s_fence->finished);
187 	wake_up_interruptible(&sched->wake_up_worker);
188 }
189 
190 /**
191  * drm_sched_job_done_cb - the callback for a done job
192  * @f: fence
193  * @cb: fence callbacks
194  */
195 static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
196 {
197 	struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
198 
199 	drm_sched_job_done(s_job);
200 }
201 
202 /**
203  * drm_sched_dependency_optimized - test if the dependency can be optimized
204  *
205  * @fence: the dependency fence
206  * @entity: the entity which depends on the above fence
207  *
208  * Returns true if the dependency can be optimized and false otherwise
209  */
210 bool drm_sched_dependency_optimized(struct dma_fence* fence,
211 				    struct drm_sched_entity *entity)
212 {
213 	struct drm_gpu_scheduler *sched = entity->rq->sched;
214 	struct drm_sched_fence *s_fence;
215 
216 	if (!fence || dma_fence_is_signaled(fence))
217 		return false;
218 	if (fence->context == entity->fence_context)
219 		return true;
220 	s_fence = to_drm_sched_fence(fence);
221 	if (s_fence && s_fence->sched == sched)
222 		return true;
223 
224 	return false;
225 }
226 EXPORT_SYMBOL(drm_sched_dependency_optimized);
227 
228 /**
229  * drm_sched_start_timeout - start timeout for reset worker
230  *
231  * @sched: scheduler instance to start the worker for
232  *
233  * Start the timeout for the given scheduler.
234  */
235 static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
236 {
237 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
238 	    !list_empty(&sched->pending_list))
239 		queue_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
240 }
241 
242 /**
243  * drm_sched_fault - immediately start timeout handler
244  *
245  * @sched: scheduler where the timeout handling should be started.
246  *
247  * Start timeout handling immediately when the driver detects a hardware fault.
248  */
249 void drm_sched_fault(struct drm_gpu_scheduler *sched)
250 {
251 	mod_delayed_work(sched->timeout_wq, &sched->work_tdr, 0);
252 }
253 EXPORT_SYMBOL(drm_sched_fault);
254 
255 /**
256  * drm_sched_suspend_timeout - Suspend scheduler job timeout
257  *
258  * @sched: scheduler instance for which to suspend the timeout
259  *
260  * Suspend the delayed work timeout for the scheduler. This is done by
261  * modifying the delayed work timeout to an arbitrarily large value,
262  * MAX_SCHEDULE_TIMEOUT in this case.
263  *
264  * Returns the timeout remaining
265  *
266  */
267 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
268 {
269 	unsigned long sched_timeout, now = jiffies;
270 
271 #ifdef __linux__
272 	sched_timeout = sched->work_tdr.timer.expires;
273 #else
274 	sched_timeout = sched->work_tdr.to.to_time;
275 #endif
276 
277 	/*
278 	 * Modify the timeout to an arbitrarily large value. This also prevents
279 	 * the timeout from being restarted when new submissions arrive.
280 	 */
281 	if (mod_delayed_work(sched->timeout_wq, &sched->work_tdr, MAX_SCHEDULE_TIMEOUT)
282 			&& time_after(sched_timeout, now))
283 		return sched_timeout - now;
284 	else
285 		return sched->timeout;
286 }
287 EXPORT_SYMBOL(drm_sched_suspend_timeout);
288 
289 /**
290  * drm_sched_resume_timeout - Resume scheduler job timeout
291  *
292  * @sched: scheduler instance for which to resume the timeout
293  * @remaining: remaining timeout
294  *
295  * Resume the delayed work timeout for the scheduler.
296  */
297 void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
298 		unsigned long remaining)
299 {
300 	spin_lock(&sched->job_list_lock);
301 
302 	if (list_empty(&sched->pending_list))
303 		cancel_delayed_work(&sched->work_tdr);
304 	else
305 		mod_delayed_work(sched->timeout_wq, &sched->work_tdr, remaining);
306 
307 	spin_unlock(&sched->job_list_lock);
308 }
309 EXPORT_SYMBOL(drm_sched_resume_timeout);
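/*
 * Hedged sketch of how the suspend/resume pair above is typically used on the
 * driver side ("sched" stands for whichever scheduler instance is affected;
 * the work done in between is driver specific):
 *
 *	unsigned long remaining = drm_sched_suspend_timeout(sched);
 *
 *	... do work that must not trigger a spurious timeout, e.g. preempt or
 *	    inspect the ring ...
 *
 *	drm_sched_resume_timeout(sched, remaining);
 */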
310 
311 static void drm_sched_job_begin(struct drm_sched_job *s_job)
312 {
313 	struct drm_gpu_scheduler *sched = s_job->sched;
314 
315 	spin_lock(&sched->job_list_lock);
316 	list_add_tail(&s_job->list, &sched->pending_list);
317 	drm_sched_start_timeout(sched);
318 	spin_unlock(&sched->job_list_lock);
319 }
320 
321 static void drm_sched_job_timedout(struct work_struct *work)
322 {
323 	struct drm_gpu_scheduler *sched;
324 	struct drm_sched_job *job;
325 	enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;
326 
327 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
328 
329 	/* Protects against concurrent deletion in drm_sched_get_cleanup_job */
330 	spin_lock(&sched->job_list_lock);
331 	job = list_first_entry_or_null(&sched->pending_list,
332 				       struct drm_sched_job, list);
333 
334 	if (job) {
335 		/*
336 		 * Remove the bad job so it cannot be freed by a concurrent
337 		 * drm_sched_get_cleanup_job. It will be reinserted after sched->thread
338 		 * is parked, at which point it is safe.
339 		 */
340 		list_del_init(&job->list);
341 		spin_unlock(&sched->job_list_lock);
342 
343 		status = job->sched->ops->timedout_job(job);
344 
345 		/*
346 		 * The guilty job did complete and hence needs to be manually removed;
347 		 * see the drm_sched_stop() documentation.
348 		 */
349 		if (sched->free_guilty) {
350 			job->sched->ops->free_job(job);
351 			sched->free_guilty = false;
352 		}
353 	} else {
354 		spin_unlock(&sched->job_list_lock);
355 	}
356 
357 	if (status != DRM_GPU_SCHED_STAT_ENODEV) {
358 		spin_lock(&sched->job_list_lock);
359 		drm_sched_start_timeout(sched);
360 		spin_unlock(&sched->job_list_lock);
361 	}
362 }
363 
364  /**
365   * drm_sched_increase_karma - Update sched_entity guilty flag
366   *
367   * @bad: The job guilty of time out
368   *
369   * Increment on every hang caused by the 'bad' job. If this exceeds the hang
370   * limit of the scheduler then the respective sched entity is marked guilty and
371   * jobs from it will not be scheduled further
372   */
373 void drm_sched_increase_karma(struct drm_sched_job *bad)
374 {
375 	drm_sched_increase_karma_ext(bad, 1);
376 }
377 EXPORT_SYMBOL(drm_sched_increase_karma);
378 
379 void drm_sched_reset_karma(struct drm_sched_job *bad)
380 {
381 	drm_sched_increase_karma_ext(bad, 0);
382 }
383 EXPORT_SYMBOL(drm_sched_reset_karma);
384 
385 /**
386  * drm_sched_stop - stop the scheduler
387  *
388  * @sched: scheduler instance
389  * @bad: job which caused the time out
390  *
391  * Stop the scheduler, and also remove and free all completed jobs.
392  * Note: the bad job will not be freed as it might be used later, so it is
393  * the caller's responsibility to release it manually if it is no longer
394  * part of the pending list.
395  *
396  */
397 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
398 {
399 	struct drm_sched_job *s_job, *tmp;
400 
401 	kthread_park(sched->thread);
402 
403 	/*
404 	 * Reinsert the bad job here - now it is safe as
405 	 * drm_sched_get_cleanup_job cannot race against us and release the
406 	 * bad job at this point - we parked (waited for) any in progress
407 	 * (earlier) cleanups and drm_sched_get_cleanup_job will not be called
408 	 * now until the scheduler thread is unparked.
409 	 */
410 	if (bad && bad->sched == sched)
411 		/*
412 		 * Add at the head of the queue to reflect it was the earliest
413 		 * job extracted.
414 		 */
415 		list_add(&bad->list, &sched->pending_list);
416 
417 	/*
418 	 * Iterate the job list from the last to the earliest job and either
419 	 * deactivate their HW callbacks or remove them from the pending list if
420 	 * they have already signaled.
421 	 * This iteration is thread safe as the sched thread is stopped.
422 	 */
423 	list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list,
424 					 list) {
425 		if (s_job->s_fence->parent &&
426 		    dma_fence_remove_callback(s_job->s_fence->parent,
427 					      &s_job->cb)) {
428 			dma_fence_put(s_job->s_fence->parent);
429 			s_job->s_fence->parent = NULL;
430 			atomic_dec(&sched->hw_rq_count);
431 		} else {
432 			/*
433 			 * remove job from pending_list.
434 			 * Locking here is for concurrent resume timeout
435 			 */
436 			spin_lock(&sched->job_list_lock);
437 			list_del_init(&s_job->list);
438 			spin_unlock(&sched->job_list_lock);
439 
440 			/*
441 			 * Wait for job's HW fence callback to finish using s_job
442 			 * before releasing it.
443 			 *
444 			 * The job is still alive, so the fence refcount is at least 1.
445 			 */
446 			dma_fence_wait(&s_job->s_fence->finished, false);
447 
448 			/*
449 			 * We must keep the bad job alive for later use during
450 			 * recovery by some of the drivers, but leave a hint
451 			 * that the guilty job must be released.
452 			 */
453 			if (bad != s_job)
454 				sched->ops->free_job(s_job);
455 			else
456 				sched->free_guilty = true;
457 		}
458 	}
459 
460 	/*
461 	 * Stop the pending timer in flight as we rearm it in drm_sched_start.
462 	 * This prevents any in-progress timeout work from firing right away
463 	 * after this TDR finishes and before the newly restarted jobs have had
464 	 * a chance to complete.
465 	 */
466 	cancel_delayed_work(&sched->work_tdr);
467 }
468 
469 EXPORT_SYMBOL(drm_sched_stop);
470 
471 /**
472  * drm_sched_start - recover jobs after a reset
473  *
474  * @sched: scheduler instance
475  * @full_recovery: proceed with complete sched restart
476  *
477  */
478 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
479 {
480 	struct drm_sched_job *s_job, *tmp;
481 	int r;
482 
483 	/*
484 	 * Locking the list is not required here as the sched thread is parked
485 	 * so no new jobs are being inserted or removed. Also, concurrent
486 	 * GPU recoveries can't run in parallel.
487 	 */
488 	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
489 		struct dma_fence *fence = s_job->s_fence->parent;
490 
491 		atomic_inc(&sched->hw_rq_count);
492 
493 		if (!full_recovery)
494 			continue;
495 
496 		if (fence) {
497 			r = dma_fence_add_callback(fence, &s_job->cb,
498 						   drm_sched_job_done_cb);
499 			if (r == -ENOENT)
500 				drm_sched_job_done(s_job);
501 			else if (r)
502 				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
503 					  r);
504 		} else
505 			drm_sched_job_done(s_job);
506 	}
507 
508 	if (full_recovery) {
509 		spin_lock(&sched->job_list_lock);
510 		drm_sched_start_timeout(sched);
511 		spin_unlock(&sched->job_list_lock);
512 	}
513 
514 	kthread_unpark(sched->thread);
515 }
516 EXPORT_SYMBOL(drm_sched_start);
517 
518 /**
519  * drm_sched_resubmit_jobs - helper to relaunch jobs from the pending list
520  *
521  * @sched: scheduler instance
522  *
523  */
524 void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
525 {
526 	drm_sched_resubmit_jobs_ext(sched, INT_MAX);
527 }
528 EXPORT_SYMBOL(drm_sched_resubmit_jobs);
529 
530 /**
531  * drm_sched_resubmit_jobs_ext - helper to relaunch a certain number of jobs from the pending list
532  *
533  * @sched: scheduler instance
534  * @max: maximum number of jobs to relaunch
535  *
536  */
537 void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
538 {
539 	struct drm_sched_job *s_job, *tmp;
540 	uint64_t guilty_context;
541 	bool found_guilty = false;
542 	struct dma_fence *fence;
543 	int i = 0;
544 
545 	list_for_each_entry_safe(s_job, tmp, &sched->pending_list, list) {
546 		struct drm_sched_fence *s_fence = s_job->s_fence;
547 
548 		if (i >= max)
549 			break;
550 
551 		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
552 			found_guilty = true;
553 			guilty_context = s_job->s_fence->scheduled.context;
554 		}
555 
556 		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
557 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
558 
559 		fence = sched->ops->run_job(s_job);
560 		i++;
561 
562 		if (IS_ERR_OR_NULL(fence)) {
563 			if (IS_ERR(fence))
564 				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
565 
566 			s_job->s_fence->parent = NULL;
567 		} else {
568 
569 			s_job->s_fence->parent = dma_fence_get(fence);
570 
571 			/* Drop for original kref_init */
572 			dma_fence_put(fence);
573 		}
574 	}
575 }
576 EXPORT_SYMBOL(drm_sched_resubmit_jobs_ext);
577 
578 /**
579  * drm_sched_job_init - init a scheduler job
580  * @job: scheduler job to init
581  * @entity: scheduler entity to use
582  * @owner: job owner for debugging
583  *
584  * Refer to drm_sched_entity_push_job() documentation
585  * for locking considerations.
586  *
587  * Drivers must make sure to call drm_sched_job_cleanup() if this function
588  * returns successfully, even when @job is aborted before drm_sched_job_arm() is called.
589  *
590  * WARNING: amdgpu abuses &drm_sched.ready to signal when the hardware
591  * has died, which can mean that there's no valid runqueue for an @entity.
592  * This function returns -ENOENT in this case (which probably should be -EIO as
593  * a more meaningful return value).
594  *
595  * Returns 0 for success, negative error code otherwise.
596  */
597 int drm_sched_job_init(struct drm_sched_job *job,
598 		       struct drm_sched_entity *entity,
599 		       void *owner)
600 {
601 	if (!entity->rq)
602 		return -ENOENT;
603 
604 	job->entity = entity;
605 	job->s_fence = drm_sched_fence_alloc(entity, owner);
606 	if (!job->s_fence)
607 		return -ENOMEM;
608 
609 	INIT_LIST_HEAD(&job->list);
610 
611 	xa_init_flags(&job->dependencies, XA_FLAGS_ALLOC);
612 
613 	return 0;
614 }
615 EXPORT_SYMBOL(drm_sched_job_init);
616 
617 /**
618  * drm_sched_job_arm - arm a scheduler job for execution
619  * @job: scheduler job to arm
620  *
621  * This arms a scheduler job for execution. Specifically it initializes the
622  * &drm_sched_job.s_fence of @job, so that it can be attached to struct dma_resv
623  * or other places that need to track the completion of this job.
624  *
625  * Refer to drm_sched_entity_push_job() documentation for locking
626  * considerations.
627  *
628  * This can only be called if drm_sched_job_init() succeeded.
629  */
630 void drm_sched_job_arm(struct drm_sched_job *job)
631 {
632 	struct drm_gpu_scheduler *sched;
633 	struct drm_sched_entity *entity = job->entity;
634 
635 	BUG_ON(!entity);
636 	drm_sched_entity_select_rq(entity);
637 	sched = entity->rq->sched;
638 
639 	job->sched = sched;
640 	job->s_priority = entity->rq - sched->sched_rq;
641 	job->id = atomic64_inc_return(&sched->job_id_count);
642 
643 	drm_sched_fence_init(job->s_fence, job->entity);
644 }
645 EXPORT_SYMBOL(drm_sched_job_arm);
646 
647 /**
648  * drm_sched_job_add_dependency - adds the fence as a job dependency
649  * @job: scheduler job to add the dependencies to
650  * @fence: the dma_fence to add to the list of dependencies.
651  *
652  * Note that @fence is consumed in both the success and error cases.
653  *
654  * Returns:
655  * 0 on success, or an error on failing to expand the array.
656  */
657 int drm_sched_job_add_dependency(struct drm_sched_job *job,
658 				 struct dma_fence *fence)
659 {
660 	struct dma_fence *entry;
661 	unsigned long index;
662 	u32 id = 0;
663 	int ret;
664 
665 	if (!fence)
666 		return 0;
667 
668 	/* Deduplicate if we already depend on a fence from the same context.
669 	 * This lets the size of the array of deps scale with the number of
670 	 * engines involved, rather than the number of BOs.
671 	 */
672 	xa_for_each(&job->dependencies, index, entry) {
673 		if (entry->context != fence->context)
674 			continue;
675 
676 		if (dma_fence_is_later(fence, entry)) {
677 			dma_fence_put(entry);
678 			xa_store(&job->dependencies, index, fence, GFP_KERNEL);
679 		} else {
680 			dma_fence_put(fence);
681 		}
682 		return 0;
683 	}
684 
685 	ret = xa_alloc(&job->dependencies, &id, fence, xa_limit_32b, GFP_KERNEL);
686 	if (ret != 0)
687 		dma_fence_put(fence);
688 
689 	return ret;
690 }
691 EXPORT_SYMBOL(drm_sched_job_add_dependency);
692 
693 /**
694  * drm_sched_job_add_implicit_dependencies - adds implicit dependencies as job
695  *   dependencies
696  * @job: scheduler job to add the dependencies to
697  * @obj: the gem object to add new dependencies from.
698  * @write: whether the job might write the object (so we need to depend on
699  * shared fences in the reservation object).
700  *
701  * This should be called after drm_gem_lock_reservations() on your array of
702  * GEM objects used in the job but before updating the reservations with your
703  * own fences.
704  *
705  * Returns:
706  * 0 on success, or an error on failing to expand the array.
707  */
708 int drm_sched_job_add_implicit_dependencies(struct drm_sched_job *job,
709 					    struct drm_gem_object *obj,
710 					    bool write)
711 {
712 	struct dma_resv_iter cursor;
713 	struct dma_fence *fence;
714 	int ret;
715 
716 	dma_resv_assert_held(obj->resv);
717 
718 	dma_resv_for_each_fence(&cursor, obj->resv, dma_resv_usage_rw(write),
719 				fence) {
720 		/* Make sure to grab an additional ref on the added fence */
721 		dma_fence_get(fence);
722 		ret = drm_sched_job_add_dependency(job, fence);
723 		if (ret) {
724 			dma_fence_put(fence);
725 			return ret;
726 		}
727 	}
728 	return 0;
729 }
730 EXPORT_SYMBOL(drm_sched_job_add_implicit_dependencies);
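/*
 * Hedged usage sketch for the dependency helpers above, following the ordering
 * the kernel-doc describes (objs, nr_objs, ticket, job and write are
 * driver-specific placeholders; error handling is omitted):
 *
 *	ret = drm_gem_lock_reservations(objs, nr_objs, &ticket);
 *
 *	for (i = 0; i < nr_objs; i++)
 *		ret = drm_sched_job_add_implicit_dependencies(&job->base,
 *							      objs[i], write);
 *
 *	drm_sched_job_arm(&job->base);
 *
 *	for (i = 0; i < nr_objs; i++)
 *		dma_resv_add_fence(objs[i]->resv, &job->base.s_fence->finished,
 *				   write ? DMA_RESV_USAGE_WRITE :
 *					   DMA_RESV_USAGE_READ);
 *
 *	drm_gem_unlock_reservations(objs, nr_objs, &ticket);
 *	drm_sched_entity_push_job(&job->base);
 */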
731 
732 
733 /**
734  * drm_sched_job_cleanup - clean up scheduler job resources
735  * @job: scheduler job to clean up
736  *
737  * Cleans up the resources allocated with drm_sched_job_init().
738  *
739  * Drivers should call this from their error unwind code if @job is aborted
740  * before drm_sched_job_arm() is called.
741  *
742  * After that point of no return @job is committed to be executed by the
743  * scheduler, and this function should be called from the
744  * &drm_sched_backend_ops.free_job callback.
745  */
746 void drm_sched_job_cleanup(struct drm_sched_job *job)
747 {
748 	struct dma_fence *fence;
749 	unsigned long index;
750 
751 	if (kref_read(&job->s_fence->finished.refcount)) {
752 		/* drm_sched_job_arm() has been called */
753 		dma_fence_put(&job->s_fence->finished);
754 	} else {
755 		/* aborted job before committing to run it */
756 		drm_sched_fence_free(job->s_fence);
757 	}
758 
759 	job->s_fence = NULL;
760 
761 	xa_for_each(&job->dependencies, index, fence) {
762 		dma_fence_put(fence);
763 	}
764 	xa_destroy(&job->dependencies);
765 
766 }
767 EXPORT_SYMBOL(drm_sched_job_cleanup);
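/*
 * Hedged error-unwind sketch for the init/arm/cleanup rules documented above.
 * my_parse_cmds() is a placeholder for driver work that can fail between
 * drm_sched_job_init() and drm_sched_job_arm():
 *
 *	ret = drm_sched_job_init(&job->base, &ctx->entity, owner);
 *	if (ret)
 *		return ret;
 *
 *	ret = my_parse_cmds(job);
 *	if (ret) {
 *		drm_sched_job_cleanup(&job->base);
 *		return ret;
 *	}
 *
 *	drm_sched_job_arm(&job->base);
 *	drm_sched_entity_push_job(&job->base);
 *
 *	After drm_sched_job_arm() the job is committed to the scheduler, and
 *	drm_sched_job_cleanup() is then called from the driver's
 *	&drm_sched_backend_ops.free_job callback instead.
 */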
768 
769 /**
770  * drm_sched_ready - is the scheduler ready
771  *
772  * @sched: scheduler instance
773  *
774  * Return true if we can push more jobs to the hw, otherwise false.
775  */
776 static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
777 {
778 	return atomic_read(&sched->hw_rq_count) <
779 		sched->hw_submission_limit;
780 }
781 
782 /**
783  * drm_sched_wakeup - Wake up the scheduler when it is ready
784  *
785  * @sched: scheduler instance
786  *
787  */
788 void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
789 {
790 	if (drm_sched_ready(sched))
791 		wake_up_interruptible(&sched->wake_up_worker);
792 }
793 
794 /**
795  * drm_sched_select_entity - Select next entity to process
796  *
797  * @sched: scheduler instance
798  *
799  * Returns the entity to process or NULL if none are found.
800  */
801 static struct drm_sched_entity *
802 drm_sched_select_entity(struct drm_gpu_scheduler *sched)
803 {
804 	struct drm_sched_entity *entity;
805 	int i;
806 
807 	if (!drm_sched_ready(sched))
808 		return NULL;
809 
810 	/* Kernel run queue has higher priority than normal run queue */
811 	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
812 		entity = drm_sched_rq_select_entity(&sched->sched_rq[i]);
813 		if (entity)
814 			break;
815 	}
816 
817 	return entity;
818 }
819 
820 /**
821  * drm_sched_get_cleanup_job - fetch the next finished job to be destroyed
822  *
823  * @sched: scheduler instance
824  *
825  * Returns the next finished job from the pending list (if there is one),
826  * ready to be destroyed.
827  */
828 static struct drm_sched_job *
829 drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
830 {
831 	struct drm_sched_job *job, *next;
832 
833 	spin_lock(&sched->job_list_lock);
834 
835 	job = list_first_entry_or_null(&sched->pending_list,
836 				       struct drm_sched_job, list);
837 
838 	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
839 		/* remove job from pending_list */
840 		list_del_init(&job->list);
841 
842 		/* cancel this job's TO timer */
843 		cancel_delayed_work(&sched->work_tdr);
844 		/* make the scheduled timestamp more accurate */
845 		next = list_first_entry_or_null(&sched->pending_list,
846 						typeof(*next), list);
847 
848 		if (next) {
849 			next->s_fence->scheduled.timestamp =
850 				dma_fence_timestamp(&job->s_fence->finished);
851 			/* start TO timer for next job */
852 			drm_sched_start_timeout(sched);
853 		}
854 	} else {
855 		job = NULL;
856 	}
857 
858 	spin_unlock(&sched->job_list_lock);
859 
860 	return job;
861 }
862 
863 /**
864  * drm_sched_pick_best - Get a drm sched from a sched_list with the least load
865  * @sched_list: list of drm_gpu_schedulers
866  * @num_sched_list: number of drm_gpu_schedulers in the sched_list
867  *
868  * Returns a pointer to the sched with the least load, or NULL if none of the
869  * drm_gpu_schedulers are ready.
870  */
871 struct drm_gpu_scheduler *
872 drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
873 		     unsigned int num_sched_list)
874 {
875 	struct drm_gpu_scheduler *sched, *picked_sched = NULL;
876 	int i;
877 	unsigned int min_score = UINT_MAX, num_score;
878 
879 	for (i = 0; i < num_sched_list; ++i) {
880 		sched = sched_list[i];
881 
882 		if (!sched->ready) {
883 			DRM_WARN("scheduler %s is not ready, skipping",
884 				 sched->name);
885 			continue;
886 		}
887 
888 		num_score = atomic_read(sched->score);
889 		if (num_score < min_score) {
890 			min_score = num_score;
891 			picked_sched = sched;
892 		}
893 	}
894 
895 	return picked_sched;
896 }
897 EXPORT_SYMBOL(drm_sched_pick_best);
898 
899 /**
900  * drm_sched_blocked - check if the scheduler is blocked
901  *
902  * @sched: scheduler instance
903  *
904  * Returns true if blocked, otherwise false.
905  */
906 static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
907 {
908 	if (kthread_should_park()) {
909 		kthread_parkme();
910 		return true;
911 	}
912 
913 	return false;
914 }
915 
916 /**
917  * drm_sched_main - main scheduler thread
918  *
919  * @param: scheduler instance
920  *
921  * Returns 0.
922  */
923 static int drm_sched_main(void *param)
924 {
925 	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
926 	int r;
927 
928 #ifdef __linux__
929 	sched_set_fifo_low(current);
930 #endif
931 
932 	while (!kthread_should_stop()) {
933 		struct drm_sched_entity *entity = NULL;
934 		struct drm_sched_fence *s_fence;
935 		struct drm_sched_job *sched_job;
936 		struct dma_fence *fence;
937 		struct drm_sched_job *cleanup_job = NULL;
938 
939 		wait_event_interruptible(sched->wake_up_worker,
940 					 (cleanup_job = drm_sched_get_cleanup_job(sched)) ||
941 					 (!drm_sched_blocked(sched) &&
942 					  (entity = drm_sched_select_entity(sched))) ||
943 					 kthread_should_stop());
944 
945 		if (cleanup_job)
946 			sched->ops->free_job(cleanup_job);
947 
948 		if (!entity)
949 			continue;
950 
951 		sched_job = drm_sched_entity_pop_job(entity);
952 
953 		if (!sched_job) {
954 			complete(&entity->entity_idle);
955 			continue;
956 		}
957 
958 		s_fence = sched_job->s_fence;
959 
960 		atomic_inc(&sched->hw_rq_count);
961 		drm_sched_job_begin(sched_job);
962 
963 		trace_drm_run_job(sched_job, entity);
964 		fence = sched->ops->run_job(sched_job);
965 		complete(&entity->entity_idle);
966 		drm_sched_fence_scheduled(s_fence);
967 
968 		if (!IS_ERR_OR_NULL(fence)) {
969 			s_fence->parent = dma_fence_get(fence);
970 			/* Drop for original kref_init of the fence */
971 			dma_fence_put(fence);
972 
973 			r = dma_fence_add_callback(fence, &sched_job->cb,
974 						   drm_sched_job_done_cb);
975 			if (r == -ENOENT)
976 				drm_sched_job_done(sched_job);
977 			else if (r)
978 				DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
979 					  r);
980 		} else {
981 			if (IS_ERR(fence))
982 				dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
983 
984 			drm_sched_job_done(sched_job);
985 		}
986 
987 		wake_up(&sched->job_scheduled);
988 	}
989 	return 0;
990 }
991 
992 /**
993  * drm_sched_init - Init a gpu scheduler instance
994  *
995  * @sched: scheduler instance
996  * @ops: backend operations for this scheduler
997  * @hw_submission: number of hw submissions that can be in flight
998  * @hang_limit: number of times to allow a job to hang before dropping it
999  * @timeout: timeout value in jiffies for the scheduler
1000  * @timeout_wq: workqueue to use for timeout work. If NULL, the system_wq is
1001  *		used
1002  * @score: optional score atomic shared with other schedulers
1003  * @name: name used for debugging
1004  * @dev: target &struct device
1005  *
1006  * Return 0 on success, otherwise error code.
1007  */
1008 int drm_sched_init(struct drm_gpu_scheduler *sched,
1009 		   const struct drm_sched_backend_ops *ops,
1010 		   unsigned hw_submission, unsigned hang_limit,
1011 		   long timeout, struct workqueue_struct *timeout_wq,
1012 		   atomic_t *score, const char *name, struct device *dev)
1013 {
1014 	int i, ret;
1015 	sched->ops = ops;
1016 	sched->hw_submission_limit = hw_submission;
1017 	sched->name = name;
1018 	sched->timeout = timeout;
1019 	sched->timeout_wq = timeout_wq ? : system_wq;
1020 	sched->hang_limit = hang_limit;
1021 	sched->score = score ? score : &sched->_score;
1022 	sched->dev = dev;
1023 	for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_COUNT; i++)
1024 		drm_sched_rq_init(sched, &sched->sched_rq[i]);
1025 
1026 	init_waitqueue_head(&sched->wake_up_worker);
1027 	init_waitqueue_head(&sched->job_scheduled);
1028 	INIT_LIST_HEAD(&sched->pending_list);
1029 	mtx_init(&sched->job_list_lock, IPL_NONE);
1030 	atomic_set(&sched->hw_rq_count, 0);
1031 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
1032 	atomic_set(&sched->_score, 0);
1033 	atomic64_set(&sched->job_id_count, 0);
1034 
1035 	/* Each scheduler will run on a separate kernel thread */
1036 	sched->thread = kthread_run(drm_sched_main, sched, sched->name);
1037 	if (IS_ERR(sched->thread)) {
1038 		ret = PTR_ERR(sched->thread);
1039 		sched->thread = NULL;
1040 		DRM_DEV_ERROR(sched->dev, "Failed to create scheduler for %s.\n", name);
1041 		return ret;
1042 	}
1043 
1044 	sched->ready = true;
1045 	return 0;
1046 }
1047 EXPORT_SYMBOL(drm_sched_init);
1048 
1049 /**
1050  * drm_sched_fini - Destroy a gpu scheduler
1051  *
1052  * @sched: scheduler instance
1053  *
1054  * Tears down and cleans up the scheduler.
1055  */
1056 void drm_sched_fini(struct drm_gpu_scheduler *sched)
1057 {
1058 	struct drm_sched_entity *s_entity;
1059 	int i;
1060 
1061 	if (sched->thread)
1062 		kthread_stop(sched->thread);
1063 
1064 	for (i = DRM_SCHED_PRIORITY_COUNT - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
1065 		struct drm_sched_rq *rq = &sched->sched_rq[i];
1066 
1067 		if (!rq)
1068 			continue;
1069 
1070 		spin_lock(&rq->lock);
1071 		list_for_each_entry(s_entity, &rq->entities, list)
1072 			/*
1073 			 * Prevents reinsertion and marks job_queue as idle;
1074 			 * it will be removed from the rq in
1075 			 * drm_sched_entity_fini() eventually.
1076 			 */
1077 			s_entity->stopped = true;
1078 		spin_unlock(&rq->lock);
1079 
1080 	}
1081 
1082 	/* Wake up everyone stuck in drm_sched_entity_flush for this scheduler */
1083 	wake_up_all(&sched->job_scheduled);
1084 
1085 	/* Confirm no work left behind accessing device structures */
1086 	cancel_delayed_work_sync(&sched->work_tdr);
1087 
1088 	sched->ready = false;
1089 }
1090 EXPORT_SYMBOL(drm_sched_fini);
1091 
1092 /**
1093  * drm_sched_increase_karma_ext - Update sched_entity guilty flag
1094  *
1095  * @bad: The job guilty of time out
1096  * @type: 0 to reset karma, 1 to increase it
1097  *
1098  */
1099 void drm_sched_increase_karma_ext(struct drm_sched_job *bad, int type)
1100 {
1101 	int i;
1102 	struct drm_sched_entity *tmp;
1103 	struct drm_sched_entity *entity;
1104 	struct drm_gpu_scheduler *sched = bad->sched;
1105 
1106 	/* Don't change @bad's karma if it's from the KERNEL RQ: a GPU hang can
1107 	 * sometimes corrupt kernel jobs (like VM updating jobs), but kernel jobs
1108 	 * are always considered good.
1109 	 */
1110 	if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
1111 		if (type == 0)
1112 			atomic_set(&bad->karma, 0);
1113 		else if (type == 1)
1114 			atomic_inc(&bad->karma);
1115 
1116 		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
1117 		     i++) {
1118 			struct drm_sched_rq *rq = &sched->sched_rq[i];
1119 
1120 			spin_lock(&rq->lock);
1121 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
1122 				if (bad->s_fence->scheduled.context ==
1123 				    entity->fence_context) {
1124 					if (entity->guilty)
1125 						atomic_set(entity->guilty, type);
1126 					break;
1127 				}
1128 			}
1129 			spin_unlock(&rq->lock);
1130 			if (&entity->list != &rq->entities)
1131 				break;
1132 		}
1133 	}
1134 }
1135 EXPORT_SYMBOL(drm_sched_increase_karma_ext);
1136