xref: /dflybsd-src/sys/dev/drm/radeon/radeon_cs.c (revision d0a74117cc5baed46a1514b9ec89c0b7943c91f2)
1 /*
2  * Copyright 2008 Jerome Glisse.
3  * All Rights Reserved.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22  * DEALINGS IN THE SOFTWARE.
23  *
24  * Authors:
25  *    Jerome Glisse <glisse@freedesktop.org>
26  */
27 #include <linux/list_sort.h>
28 #include <drm/drmP.h>
29 #include <drm/radeon_drm.h>
30 #include "radeon_reg.h"
31 #include "radeon.h"
32 #include "radeon_trace.h"
33 
34 #define RADEON_CS_MAX_PRIORITY		32u
35 #define RADEON_CS_NUM_BUCKETS		(RADEON_CS_MAX_PRIORITY + 1)
36 
37 /* This is based on the bucket sort with O(n) time complexity.
38  * An item with priority "i" is added to bucket[i]. The lists are then
39  * concatenated in descending order.
40  */
41 struct radeon_cs_buckets {
42 	struct list_head bucket[RADEON_CS_NUM_BUCKETS];
43 };
44 
45 static void radeon_cs_buckets_init(struct radeon_cs_buckets *b)
46 {
47 	unsigned i;
48 
49 	for (i = 0; i < RADEON_CS_NUM_BUCKETS; i++)
50 		INIT_LIST_HEAD(&b->bucket[i]);
51 }
52 
53 static void radeon_cs_buckets_add(struct radeon_cs_buckets *b,
54 				  struct list_head *item, unsigned priority)
55 {
56 	/* Since buffers which appear sooner in the relocation list are
57 	 * likely to be used more often than buffers which appear later
58 	 * in the list, the sort mustn't change the ordering of buffers
59 	 * with the same priority, i.e. it must be stable.
60 	 */
61 	list_add_tail(item, &b->bucket[min(priority, RADEON_CS_MAX_PRIORITY)]);
62 }
63 
64 static void radeon_cs_buckets_get_list(struct radeon_cs_buckets *b,
65 				       struct list_head *out_list)
66 {
67 	unsigned i;
68 
69 	/* Connect the sorted buckets into the output list. */
70 	for (i = 0; i < RADEON_CS_NUM_BUCKETS; i++) {
71 		list_splice(&b->bucket[i], out_list);
72 	}
73 }
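
/*
 * Typical usage of the bucket helpers above, mirroring what
 * radeon_cs_parser_relocs() below actually does (sketch only):
 *
 *	struct radeon_cs_buckets buckets;
 *
 *	radeon_cs_buckets_init(&buckets);
 *	for each reloc i:
 *		radeon_cs_buckets_add(&buckets, &p->relocs[i].tv.head, priority);
 *	radeon_cs_buckets_get_list(&buckets, &p->validated);
 */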
74 
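/*
 * radeon_cs_parser_relocs - build and validate the buffer list for a CS
 *
 * Walks the relocation chunk, looks up every GEM handle, picks the
 * preferred/allowed placement domains and a bucket priority for each
 * buffer, splices the buckets into p->validated (highest priority first)
 * and validates the whole list (plus the VM BOs when the CS uses a VM).
 */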
75 static int radeon_cs_parser_relocs(struct radeon_cs_parser *p)
76 {
77 	struct radeon_cs_chunk *chunk;
78 	struct radeon_cs_buckets buckets;
79 	unsigned i;
80 	int r;
81 
82 	if (p->chunk_relocs == NULL) {
83 		return 0;
84 	}
85 	chunk = p->chunk_relocs;
86 	p->dma_reloc_idx = 0;
87 	/* FIXME: we assume that each reloc uses 4 dwords */
88 	p->nrelocs = chunk->length_dw / 4;
89 	p->relocs = drm_calloc_large(p->nrelocs, sizeof(struct radeon_bo_list));
90 	if (p->relocs == NULL) {
91 		return -ENOMEM;
92 	}
93 
94 	radeon_cs_buckets_init(&buckets);
95 
96 	for (i = 0; i < p->nrelocs; i++) {
97 		struct drm_radeon_cs_reloc *r;
98 		struct drm_gem_object *gobj;
99 		unsigned priority;
100 
101 		r = (struct drm_radeon_cs_reloc *)&chunk->kdata[i*4];
102 		gobj = drm_gem_object_lookup(p->filp, r->handle);
103 		if (gobj == NULL) {
104 			DRM_ERROR("gem object lookup failed 0x%x\n",
105 				  r->handle);
106 			return -ENOENT;
107 		}
108 		p->relocs[i].robj = gem_to_radeon_bo(gobj);
109 
110 		/* The userspace buffer priorities are from 0 to 15. A higher
111 		 * number means the buffer is more important.
112 		 * Also, the buffers used for write have a higher priority than
113 		 * the buffers used for read only, which doubles the range
114 		 * to 0 to 31. 32 is reserved for the kernel driver.
115 		 */
116 		priority = (r->flags & RADEON_RELOC_PRIO_MASK) * 2
117 			   + !!r->write_domain;
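		/* e.g. a userspace priority of 15 on a write reloc becomes
		 * 15 * 2 + 1 = 31, just below the kernel-only bucket 32. */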
118 
119 		/* the first reloc of an UVD job is the msg and that must be in
120 		   VRAM; also put everything into VRAM on AGP cards and older
121 		   IGP chips to avoid image corruption */
122 		if (p->ring == R600_RING_TYPE_UVD_INDEX &&
123 		    (i == 0 || drm_pci_device_is_agp(p->rdev->ddev) ||
124 		     p->rdev->family == CHIP_RS780 ||
125 		     p->rdev->family == CHIP_RS880)) {
126 
127 			/* TODO: is this still needed for NI+ ? */
128 			p->relocs[i].prefered_domains =
129 				RADEON_GEM_DOMAIN_VRAM;
130 
131 			p->relocs[i].allowed_domains =
132 				RADEON_GEM_DOMAIN_VRAM;
133 
134 			/* prioritize this over any other relocation */
135 			priority = RADEON_CS_MAX_PRIORITY;
136 		} else {
137 			uint32_t domain = r->write_domain ?
138 				r->write_domain : r->read_domains;
139 
140 			if (domain & RADEON_GEM_DOMAIN_CPU) {
141 				DRM_ERROR("RADEON_GEM_DOMAIN_CPU is not valid "
142 					  "for command submission\n");
143 				return -EINVAL;
144 			}
145 
146 			p->relocs[i].prefered_domains = domain;
147 			if (domain == RADEON_GEM_DOMAIN_VRAM)
148 				domain |= RADEON_GEM_DOMAIN_GTT;
149 			p->relocs[i].allowed_domains = domain;
150 		}
151 
152 #if 0
153 		if (radeon_ttm_tt_has_userptr(p->relocs[i].robj->tbo.ttm)) {
154 			uint32_t domain = p->relocs[i].prefered_domains;
155 			if (!(domain & RADEON_GEM_DOMAIN_GTT)) {
156 				DRM_ERROR("Only RADEON_GEM_DOMAIN_GTT is "
157 					  "allowed for userptr BOs\n");
158 				return -EINVAL;
159 			}
160 			need_mmap_lock = true;
161 			domain = RADEON_GEM_DOMAIN_GTT;
162 			p->relocs[i].prefered_domains = domain;
163 			p->relocs[i].allowed_domains = domain;
164 		}
165 #endif
166 
167 		p->relocs[i].tv.bo = &p->relocs[i].robj->tbo;
168 		p->relocs[i].tv.shared = !r->write_domain;
169 
170 		radeon_cs_buckets_add(&buckets, &p->relocs[i].tv.head,
171 				      priority);
172 	}
173 
174 	radeon_cs_buckets_get_list(&buckets, &p->validated);
175 
176 	if (p->cs_flags & RADEON_CS_USE_VM)
177 		p->vm_bos = radeon_vm_get_bos(p->rdev, p->ib.vm,
178 					      &p->validated);
179 #if 0
180 	if (need_mmap_lock)
181 		down_read(&current->mm->mmap_sem);
182 #endif
183 
184 	r = radeon_bo_list_validate(p->rdev, &p->ticket, &p->validated, p->ring);
185 
186 #if 0
187 	if (need_mmap_lock)
188 		up_read(&current->mm->mmap_sem);
189 #endif
190 
191 	return r;
192 }
193 
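/*
 * radeon_cs_get_ring - map a userspace ring id and priority to a HW ring
 *
 * Translates the RADEON_CS_RING_* value supplied by userspace into the
 * kernel ring index, taking the chip family and requested priority into
 * account (e.g. compute CS fall back to the GFX ring before SI).
 * Returns -EINVAL for unknown ring ids or rings the ASIC doesn't have.
 */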
194 static int radeon_cs_get_ring(struct radeon_cs_parser *p, u32 ring, s32 priority)
195 {
196 	p->priority = priority;
197 
198 	switch (ring) {
199 	default:
200 		DRM_ERROR("unknown ring id: %d\n", ring);
201 		return -EINVAL;
202 	case RADEON_CS_RING_GFX:
203 		p->ring = RADEON_RING_TYPE_GFX_INDEX;
204 		break;
205 	case RADEON_CS_RING_COMPUTE:
206 		if (p->rdev->family >= CHIP_TAHITI) {
207 			if (p->priority > 0)
208 				p->ring = CAYMAN_RING_TYPE_CP1_INDEX;
209 			else
210 				p->ring = CAYMAN_RING_TYPE_CP2_INDEX;
211 		} else
212 			p->ring = RADEON_RING_TYPE_GFX_INDEX;
213 		break;
214 	case RADEON_CS_RING_DMA:
215 		if (p->rdev->family >= CHIP_CAYMAN) {
216 			if (p->priority > 0)
217 				p->ring = R600_RING_TYPE_DMA_INDEX;
218 			else
219 				p->ring = CAYMAN_RING_TYPE_DMA1_INDEX;
220 		} else if (p->rdev->family >= CHIP_RV770) {
221 			p->ring = R600_RING_TYPE_DMA_INDEX;
222 		} else {
223 			return -EINVAL;
224 		}
225 		break;
226 	case RADEON_CS_RING_UVD:
227 		p->ring = R600_RING_TYPE_UVD_INDEX;
228 		break;
229 	case RADEON_CS_RING_VCE:
230 		/* TODO: only use the low priority ring for now */
231 		p->ring = TN_RING_TYPE_VCE1_INDEX;
232 		break;
233 	}
234 	return 0;
235 }
236 
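/*
 * radeon_cs_sync_rings - make the IB wait for prior users of its buffers
 *
 * For every validated buffer, add the fences from its reservation object
 * to the IB's sync object; reloc->tv.shared tells the sync code whether a
 * read-only (shared) wait is sufficient for that buffer.
 */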
237 static int radeon_cs_sync_rings(struct radeon_cs_parser *p)
238 {
239 	struct radeon_bo_list *reloc;
240 	int r;
241 
242 	list_for_each_entry(reloc, &p->validated, tv.head) {
243 		struct reservation_object *resv;
244 
245 		resv = reloc->robj->tbo.resv;
246 		r = radeon_sync_resv(p->rdev, &p->ib.sync, resv,
247 				     reloc->tv.shared);
248 		if (r)
249 			return r;
250 	}
251 	return 0;
252 }
253 
254 /* XXX: note that this is called from the legacy UMS CS ioctl as well */
255 int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data)
256 {
257 	struct drm_radeon_cs *cs = data;
258 	uint64_t *chunk_array_ptr;
259 	unsigned size, i;
260 	u32 ring = RADEON_CS_RING_GFX;
261 	s32 priority = 0;
262 
263 	INIT_LIST_HEAD(&p->validated);
264 
265 	if (!cs->num_chunks) {
266 		return 0;
267 	}
268 
269 	/* get chunks */
270 	p->idx = 0;
271 	p->ib.sa_bo = NULL;
272 	p->const_ib.sa_bo = NULL;
273 	p->chunk_ib = NULL;
274 	p->chunk_relocs = NULL;
275 	p->chunk_flags = NULL;
276 	p->chunk_const_ib = NULL;
277 	p->chunks_array = kcalloc(cs->num_chunks, sizeof(uint64_t), GFP_KERNEL);
278 	if (p->chunks_array == NULL) {
279 		return -ENOMEM;
280 	}
281 	chunk_array_ptr = (uint64_t *)(unsigned long)(cs->chunks);
282 	if (copy_from_user(p->chunks_array, chunk_array_ptr,
283 			       sizeof(uint64_t)*cs->num_chunks)) {
284 		return -EFAULT;
285 	}
286 	p->cs_flags = 0;
287 	p->nchunks = cs->num_chunks;
288 	p->chunks = kcalloc(p->nchunks, sizeof(struct radeon_cs_chunk), GFP_KERNEL);
289 	if (p->chunks == NULL) {
290 		return -ENOMEM;
291 	}
292 	for (i = 0; i < p->nchunks; i++) {
293 		struct drm_radeon_cs_chunk __user **chunk_ptr = NULL;
294 		struct drm_radeon_cs_chunk user_chunk;
295 		uint32_t __user *cdata;
296 
297 		chunk_ptr = (void __user*)(unsigned long)p->chunks_array[i];
298 		if (copy_from_user(&user_chunk, chunk_ptr,
299 				       sizeof(struct drm_radeon_cs_chunk))) {
300 			return -EFAULT;
301 		}
302 		p->chunks[i].length_dw = user_chunk.length_dw;
303 		if (user_chunk.chunk_id == RADEON_CHUNK_ID_RELOCS) {
304 			p->chunk_relocs = &p->chunks[i];
305 		}
306 		if (user_chunk.chunk_id == RADEON_CHUNK_ID_IB) {
307 			p->chunk_ib = &p->chunks[i];
308 			/* zero length IB isn't useful */
309 			if (p->chunks[i].length_dw == 0)
310 				return -EINVAL;
311 		}
312 		if (user_chunk.chunk_id == RADEON_CHUNK_ID_CONST_IB) {
313 			p->chunk_const_ib = &p->chunks[i];
314 			/* zero length CONST IB isn't useful */
315 			if (p->chunks[i].length_dw == 0)
316 				return -EINVAL;
317 		}
318 		if (user_chunk.chunk_id == RADEON_CHUNK_ID_FLAGS) {
319 			p->chunk_flags = &p->chunks[i];
320 			/* zero length flags aren't useful */
321 			if (p->chunks[i].length_dw == 0)
322 				return -EINVAL;
323 		}
324 
325 		size = p->chunks[i].length_dw;
326 		cdata = (void __user *)(unsigned long)user_chunk.chunk_data;
327 		p->chunks[i].user_ptr = cdata;
328 		if (user_chunk.chunk_id == RADEON_CHUNK_ID_CONST_IB)
329 			continue;
330 
331 		if (user_chunk.chunk_id == RADEON_CHUNK_ID_IB) {
332 			if (!p->rdev || !(p->rdev->flags & RADEON_IS_AGP))
333 				continue;
334 		}
335 
336 		p->chunks[i].kdata = drm_malloc_ab(size, sizeof(uint32_t));
337 		size *= sizeof(uint32_t);
338 		if (p->chunks[i].kdata == NULL) {
339 			return -ENOMEM;
340 		}
341 		if (copy_from_user(p->chunks[i].kdata, cdata, size)) {
342 			return -EFAULT;
343 		}
344 		if (user_chunk.chunk_id == RADEON_CHUNK_ID_FLAGS) {
345 			p->cs_flags = p->chunks[i].kdata[0];
346 			if (p->chunks[i].length_dw > 1)
347 				ring = p->chunks[i].kdata[1];
348 			if (p->chunks[i].length_dw > 2)
349 				priority = (s32)p->chunks[i].kdata[2];
350 		}
351 	}
352 
353 	/* these are KMS only */
354 	if (p->rdev) {
355 		if ((p->cs_flags & RADEON_CS_USE_VM) &&
356 		    !p->rdev->vm_manager.enabled) {
357 			DRM_ERROR("VM not active on asic!\n");
358 			return -EINVAL;
359 		}
360 
361 		if (radeon_cs_get_ring(p, ring, priority))
362 			return -EINVAL;
363 
364 		/* we only support VM on some SI+ rings */
365 		if ((p->cs_flags & RADEON_CS_USE_VM) == 0) {
366 			if (p->rdev->asic->ring[p->ring]->cs_parse == NULL) {
367 				DRM_ERROR("Ring %d requires VM!\n", p->ring);
368 				return -EINVAL;
369 			}
370 		} else {
371 			if (p->rdev->asic->ring[p->ring]->ib_parse == NULL) {
372 				DRM_ERROR("VM not supported on ring %d!\n",
373 					  p->ring);
374 				return -EINVAL;
375 			}
376 		}
377 	}
378 
379 	return 0;
380 }
381 
382 static int cmp_size_smaller_first(void *priv, struct list_head *a,
383 				  struct list_head *b)
384 {
385 	struct radeon_bo_list *la = list_entry(a, struct radeon_bo_list, tv.head);
386 	struct radeon_bo_list *lb = list_entry(b, struct radeon_bo_list, tv.head);
387 
388 	/* Sort A before B if A is smaller. */
389 	return (int)la->robj->tbo.num_pages - (int)lb->robj->tbo.num_pages;
390 }
391 
392 /**
393  * radeon_cs_parser_fini() - clean parser states
394  * @parser:	parser structure holding parsing context.
395  * @error:	error number
396  *
397  * If error is set then unreserve the validated buffers, otherwise just
398  * free memory used by parsing context.
399  **/
400 static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bool backoff)
401 {
402 	unsigned i;
403 
404 	if (!error) {
405 		/* Sort the buffer list from the smallest to largest buffer,
406 		 * which affects the order of buffers in the LRU list.
407 		 * This assures that the smallest buffers are added first
408 		 * to the LRU list, so they are likely to be evicted first
409 		 * later on, instead of large buffers whose eviction is more
410 		 * expensive.
411 		 *
412 		 * This slightly lowers the number of bytes moved by TTM
413 		 * per frame under memory pressure.
414 		 */
415 		list_sort(NULL, &parser->validated, cmp_size_smaller_first);
416 
417 		ttm_eu_fence_buffer_objects(&parser->ticket,
418 					    &parser->validated,
419 					    &parser->ib.fence->base);
420 	} else if (backoff) {
421 		ttm_eu_backoff_reservation(&parser->ticket,
422 					   &parser->validated);
423 	}
424 
425 	if (parser->relocs != NULL) {
426 		for (i = 0; i < parser->nrelocs; i++) {
427 			struct radeon_bo *bo = parser->relocs[i].robj;
428 			if (bo == NULL)
429 				continue;
430 
431 			drm_gem_object_unreference_unlocked(&bo->gem_base);
432 		}
433 	}
434 	kfree(parser->track);
435 	drm_free_large(parser->relocs);
436 	drm_free_large(parser->vm_bos);
437 	for (i = 0; i < parser->nchunks; i++)
438 		drm_free_large(parser->chunks[i].kdata);
439 	kfree(parser->chunks);
440 	kfree(parser->chunks_array);
441 	radeon_ib_free(parser->rdev, &parser->ib);
442 	radeon_ib_free(parser->rdev, &parser->const_ib);
443 }
444 
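/*
 * radeon_cs_ib_chunk - check and submit a CS that uses physical addresses
 *
 * Only used when RADEON_CS_USE_VM is not set: runs the per-ASIC packet
 * checker (radeon_cs_parse), syncs against the validated buffers, notes
 * UVD/VCE activity and schedules the IB on the selected ring.
 */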
445 static int radeon_cs_ib_chunk(struct radeon_device *rdev,
446 			      struct radeon_cs_parser *parser)
447 {
448 	int r;
449 
450 	if (parser->chunk_ib == NULL)
451 		return 0;
452 
453 	if (parser->cs_flags & RADEON_CS_USE_VM)
454 		return 0;
455 
456 	r = radeon_cs_parse(rdev, parser->ring, parser);
457 	if (r || parser->parser_error) {
458 		DRM_ERROR("Invalid command stream !\n");
459 		return r;
460 	}
461 
462 	r = radeon_cs_sync_rings(parser);
463 	if (r) {
464 		if (r != -ERESTARTSYS)
465 			DRM_ERROR("Failed to sync rings: %i\n", r);
466 		return r;
467 	}
468 
469 	if (parser->ring == R600_RING_TYPE_UVD_INDEX)
470 		radeon_uvd_note_usage(rdev);
471 	else if ((parser->ring == TN_RING_TYPE_VCE1_INDEX) ||
472 		 (parser->ring == TN_RING_TYPE_VCE2_INDEX))
473 		radeon_vce_note_usage(rdev);
474 
475 	r = radeon_ib_schedule(rdev, &parser->ib, NULL, true);
476 	if (r) {
477 		DRM_ERROR("Failed to schedule IB !\n");
478 	}
479 	return r;
480 }
481 
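/*
 * radeon_bo_vm_update_pte - bring the VM page tables up to date for this CS
 *
 * Updates the page directory, clears freed mappings, refreshes the PTEs of
 * the ring temporary BO and of every relocated buffer, makes the IB wait
 * for the page-table update fences and finally clears invalidated entries.
 */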
482 static int radeon_bo_vm_update_pte(struct radeon_cs_parser *p,
483 				   struct radeon_vm *vm)
484 {
485 	struct radeon_device *rdev = p->rdev;
486 	struct radeon_bo_va *bo_va;
487 	int i, r;
488 
489 	r = radeon_vm_update_page_directory(rdev, vm);
490 	if (r)
491 		return r;
492 
493 	r = radeon_vm_clear_freed(rdev, vm);
494 	if (r)
495 		return r;
496 
497 	if (vm->ib_bo_va == NULL) {
498 		DRM_ERROR("Tmp BO not in VM!\n");
499 		return -EINVAL;
500 	}
501 
502 	r = radeon_vm_bo_update(rdev, vm->ib_bo_va,
503 				&rdev->ring_tmp_bo.bo->tbo.mem);
504 	if (r)
505 		return r;
506 
507 	for (i = 0; i < p->nrelocs; i++) {
508 		struct radeon_bo *bo;
509 
510 		bo = p->relocs[i].robj;
511 		bo_va = radeon_vm_bo_find(vm, bo);
512 		if (bo_va == NULL) {
513 			dev_err(rdev->dev, "bo %p not in vm %p\n", bo, vm);
514 			return -EINVAL;
515 		}
516 
517 		r = radeon_vm_bo_update(rdev, bo_va, &bo->tbo.mem);
518 		if (r)
519 			return r;
520 
521 		radeon_sync_fence(&p->ib.sync, bo_va->last_pt_update);
522 	}
523 
524 	return radeon_vm_clear_invalids(rdev, vm);
525 }
526 
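/*
 * radeon_cs_ib_vm_chunk - check and submit a CS that runs inside a VM
 *
 * Only used when RADEON_CS_USE_VM is set: runs the per-ASIC IB checker on
 * the const IB (if present) and the main IB, updates the page tables under
 * vm->mutex, syncs against the validated buffers and schedules the IB(s).
 */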
527 static int radeon_cs_ib_vm_chunk(struct radeon_device *rdev,
528 				 struct radeon_cs_parser *parser)
529 {
530 	struct radeon_fpriv *fpriv = parser->filp->driver_priv;
531 	struct radeon_vm *vm = &fpriv->vm;
532 	int r;
533 
534 	if (parser->chunk_ib == NULL)
535 		return 0;
536 	if ((parser->cs_flags & RADEON_CS_USE_VM) == 0)
537 		return 0;
538 
539 	if (parser->const_ib.length_dw) {
540 		r = radeon_ring_ib_parse(rdev, parser->ring, &parser->const_ib);
541 		if (r) {
542 			return r;
543 		}
544 	}
545 
546 	r = radeon_ring_ib_parse(rdev, parser->ring, &parser->ib);
547 	if (r) {
548 		return r;
549 	}
550 
551 	if (parser->ring == R600_RING_TYPE_UVD_INDEX)
552 		radeon_uvd_note_usage(rdev);
553 
554 	mutex_lock(&vm->mutex);
555 	r = radeon_bo_vm_update_pte(parser, vm);
556 	if (r) {
557 		goto out;
558 	}
559 
560 	r = radeon_cs_sync_rings(parser);
561 	if (r) {
562 		if (r != -ERESTARTSYS)
563 			DRM_ERROR("Failed to sync rings: %i\n", r);
564 		goto out;
565 	}
566 
567 	if ((rdev->family >= CHIP_TAHITI) &&
568 	    (parser->chunk_const_ib != NULL)) {
569 		r = radeon_ib_schedule(rdev, &parser->ib, &parser->const_ib, true);
570 	} else {
571 		r = radeon_ib_schedule(rdev, &parser->ib, NULL, true);
572 	}
573 
574 out:
575 	mutex_unlock(&vm->mutex);
576 	return r;
577 }
578 
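/*
 * radeon_cs_handle_lockup - turn a lockup error into a GPU reset
 *
 * -EDEADLK from the submission path means a lockup was detected; trigger a
 * GPU reset and, if it succeeds, return -EAGAIN so userspace resubmits.
 */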
579 static int radeon_cs_handle_lockup(struct radeon_device *rdev, int r)
580 {
581 	if (r == -EDEADLK) {
582 		r = radeon_gpu_reset(rdev);
583 		if (!r)
584 			r = -EAGAIN;
585 	}
586 	return r;
587 }
588 
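/*
 * radeon_cs_ib_fill - allocate the IB(s) and copy in the command stream
 *
 * Allocates the main IB (and, on SI+ with a CONST_IB chunk, the const IB),
 * checks VM submissions against RADEON_IB_VM_MAX_SIZE and copies the
 * command words either from the cached kdata or straight from userspace.
 */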
589 static int radeon_cs_ib_fill(struct radeon_device *rdev, struct radeon_cs_parser *parser)
590 {
591 	struct radeon_cs_chunk *ib_chunk;
592 	struct radeon_vm *vm = NULL;
593 	int r;
594 
595 	if (parser->chunk_ib == NULL)
596 		return 0;
597 
598 	if (parser->cs_flags & RADEON_CS_USE_VM) {
599 		struct radeon_fpriv *fpriv = parser->filp->driver_priv;
600 		vm = &fpriv->vm;
601 
602 		if ((rdev->family >= CHIP_TAHITI) &&
603 		    (parser->chunk_const_ib != NULL)) {
604 			ib_chunk = parser->chunk_const_ib;
605 			if (ib_chunk->length_dw > RADEON_IB_VM_MAX_SIZE) {
606 				DRM_ERROR("cs IB CONST too big: %d\n", ib_chunk->length_dw);
607 				return -EINVAL;
608 			}
609 			r =  radeon_ib_get(rdev, parser->ring, &parser->const_ib,
610 					   vm, ib_chunk->length_dw * 4);
611 			if (r) {
612 				DRM_ERROR("Failed to get const ib !\n");
613 				return r;
614 			}
615 			parser->const_ib.is_const_ib = true;
616 			parser->const_ib.length_dw = ib_chunk->length_dw;
617 			if (copy_from_user(parser->const_ib.ptr,
618 					       ib_chunk->user_ptr,
619 					       ib_chunk->length_dw * 4))
620 				return -EFAULT;
621 		}
622 
623 		ib_chunk = parser->chunk_ib;
624 		if (ib_chunk->length_dw > RADEON_IB_VM_MAX_SIZE) {
625 			DRM_ERROR("cs IB too big: %d\n", ib_chunk->length_dw);
626 			return -EINVAL;
627 		}
628 	}
629 	ib_chunk = parser->chunk_ib;
630 
631 	r =  radeon_ib_get(rdev, parser->ring, &parser->ib,
632 			   vm, ib_chunk->length_dw * 4);
633 	if (r) {
634 		DRM_ERROR("Failed to get ib !\n");
635 		return r;
636 	}
637 	parser->ib.length_dw = ib_chunk->length_dw;
638 	if (ib_chunk->kdata)
639 		memcpy(parser->ib.ptr, ib_chunk->kdata, ib_chunk->length_dw * 4);
640 	else if (copy_from_user(parser->ib.ptr, ib_chunk->user_ptr, ib_chunk->length_dw * 4))
641 		return -EFAULT;
642 	return 0;
643 }
644 
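/*
 * radeon_cs_ioctl - main entry point for command submission from userspace
 *
 * Takes the exclusive lock shared so a GPU reset can't run concurrently,
 * initializes the parser from the ioctl data, fills the IB, gathers and
 * validates the relocations, then hands the stream to either the physical
 * or the VM submission path; cleanup and lockup handling are common.
 */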
645 int radeon_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
646 {
647 	struct radeon_device *rdev = dev->dev_private;
648 	struct radeon_cs_parser parser;
649 	int r;
650 
651 	down_read(&rdev->exclusive_lock);
652 	if (!rdev->accel_working) {
653 		up_read(&rdev->exclusive_lock);
654 		return -EBUSY;
655 	}
656 	if (rdev->in_reset) {
657 		up_read(&rdev->exclusive_lock);
658 		r = radeon_gpu_reset(rdev);
659 		if (!r)
660 			r = -EAGAIN;
661 		return r;
662 	}
663 	/* initialize parser */
664 	memset(&parser, 0, sizeof(struct radeon_cs_parser));
665 	parser.filp = filp;
666 	parser.rdev = rdev;
667 	parser.dev = rdev->dev;
668 	parser.family = rdev->family;
669 	r = radeon_cs_parser_init(&parser, data);
670 	if (r) {
671 		DRM_ERROR("Failed to initialize parser !\n");
672 		radeon_cs_parser_fini(&parser, r, false);
673 		up_read(&rdev->exclusive_lock);
674 		r = radeon_cs_handle_lockup(rdev, r);
675 		return r;
676 	}
677 
678 	r = radeon_cs_ib_fill(rdev, &parser);
679 	if (!r) {
680 		r = radeon_cs_parser_relocs(&parser);
681 		if (r && r != -ERESTARTSYS)
682 			DRM_ERROR("Failed to parse relocation %d!\n", r);
683 	}
684 
685 	if (r) {
686 		radeon_cs_parser_fini(&parser, r, false);
687 		up_read(&rdev->exclusive_lock);
688 		r = radeon_cs_handle_lockup(rdev, r);
689 		return r;
690 	}
691 
692 #ifdef TRACE_TODO
693 	trace_radeon_cs(&parser);
694 #endif
695 
696 	r = radeon_cs_ib_chunk(rdev, &parser);
697 	if (r) {
698 		goto out;
699 	}
700 	r = radeon_cs_ib_vm_chunk(rdev, &parser);
701 	if (r) {
702 		goto out;
703 	}
704 out:
705 	radeon_cs_parser_fini(&parser, r, true);
706 	up_read(&rdev->exclusive_lock);
707 	r = radeon_cs_handle_lockup(rdev, r);
708 	return r;
709 }
710 
711 /**
712  * radeon_cs_packet_parse() - parse cp packet and point ib index to next packet
713  * @p:		parser structure holding parsing context.
714  * @pkt:	where to store packet information
715  *
716  * Assume that chunk_ib_index is properly set. Will return -EINVAL
717  * if the packet is bigger than the remaining ib size, or if the packet is unknown.
718  **/
719 int radeon_cs_packet_parse(struct radeon_cs_parser *p,
720 			   struct radeon_cs_packet *pkt,
721 			   unsigned idx)
722 {
723 	struct radeon_cs_chunk *ib_chunk = p->chunk_ib;
724 	struct radeon_device *rdev = p->rdev;
725 	uint32_t header;
726 	int ret = 0, i;
727 
728 	if (idx >= ib_chunk->length_dw) {
729 		DRM_ERROR("Can not parse packet at %d after CS end %d !\n",
730 			  idx, ib_chunk->length_dw);
731 		return -EINVAL;
732 	}
733 	header = radeon_get_ib_value(p, idx);
734 	pkt->idx = idx;
735 	pkt->type = RADEON_CP_PACKET_GET_TYPE(header);
736 	pkt->count = RADEON_CP_PACKET_GET_COUNT(header);
737 	pkt->one_reg_wr = 0;
738 	switch (pkt->type) {
739 	case RADEON_PACKET_TYPE0:
740 		if (rdev->family < CHIP_R600) {
741 			pkt->reg = R100_CP_PACKET0_GET_REG(header);
742 			pkt->one_reg_wr =
743 				RADEON_CP_PACKET0_GET_ONE_REG_WR(header);
744 		} else
745 			pkt->reg = R600_CP_PACKET0_GET_REG(header);
746 		break;
747 	case RADEON_PACKET_TYPE3:
748 		pkt->opcode = RADEON_CP_PACKET3_GET_OPCODE(header);
749 		break;
750 	case RADEON_PACKET_TYPE2:
751 		pkt->count = -1;
752 		break;
753 	default:
754 		DRM_ERROR("Unknown packet type %d at %d !\n", pkt->type, idx);
755 		ret = -EINVAL;
756 		goto dump_ib;
757 	}
758 	if ((pkt->count + 1 + pkt->idx) >= ib_chunk->length_dw) {
759 		DRM_ERROR("Packet (%d:%d:%d) end after CS buffer (%d) !\n",
760 			  pkt->idx, pkt->type, pkt->count, ib_chunk->length_dw);
761 		ret = -EINVAL;
762 		goto dump_ib;
763 	}
764 	return 0;
765 
766 dump_ib:
767 	for (i = 0; i < ib_chunk->length_dw; i++) {
768 		if (i == idx)
769 			printk("\t0x%08x <---\n", radeon_get_ib_value(p, i));
770 		else
771 			printk("\t0x%08x\n", radeon_get_ib_value(p, i));
772 	}
773 	return ret;
774 }
775 
776 /**
777  * radeon_cs_packet_next_is_pkt3_nop() - test if the next packet is P3 NOP
778  * @p:		structure holding the parser context.
779  *
780  * Check if the next packet is NOP relocation packet3.
781  **/
782 bool radeon_cs_packet_next_is_pkt3_nop(struct radeon_cs_parser *p)
783 {
784 	struct radeon_cs_packet p3reloc;
785 	int r;
786 
787 	r = radeon_cs_packet_parse(p, &p3reloc, p->idx);
788 	if (r)
789 		return false;
790 	if (p3reloc.type != RADEON_PACKET_TYPE3)
791 		return false;
792 	if (p3reloc.opcode != RADEON_PACKET3_NOP)
793 		return false;
794 	return true;
795 }
796 
797 /**
798  * radeon_cs_dump_packet() - dump raw packet context
799  * @p:		structure holding the parser context.
800  * @pkt:	structure holding the packet.
801  *
802  * Used mostly for debugging and error reporting.
803  **/
804 void radeon_cs_dump_packet(struct radeon_cs_parser *p,
805 			   struct radeon_cs_packet *pkt)
806 {
807 	volatile uint32_t *ib;
808 	unsigned i;
809 	unsigned idx;
810 
811 	ib = p->ib.ptr;
812 	idx = pkt->idx;
813 	for (i = 0; i <= (pkt->count + 1); i++, idx++)
814 		DRM_INFO("ib[%d]=0x%08X\n", idx, ib[idx]);
815 }
816 
817 /**
818  * radeon_cs_packet_next_reloc() - parse next (should be reloc) packet
819  * @p:			parser structure holding parsing context.
820  * @cs_reloc:		where to return a pointer to the reloc entry
821  * @nomm:		no memory management; take the GPU offset directly
822  *			from the relocation chunk instead of the BO list
824  *
825  * Check if the next packet is a relocation packet3 and return the
826  * matching buffer entry through @cs_reloc.
827  **/
828 int radeon_cs_packet_next_reloc(struct radeon_cs_parser *p,
829 				struct radeon_bo_list **cs_reloc,
830 				int nomm)
831 {
832 	struct radeon_cs_chunk *relocs_chunk;
833 	struct radeon_cs_packet p3reloc;
834 	unsigned idx;
835 	int r;
836 
837 	if (p->chunk_relocs == NULL) {
838 		DRM_ERROR("No relocation chunk !\n");
839 		return -EINVAL;
840 	}
841 	*cs_reloc = NULL;
842 	relocs_chunk = p->chunk_relocs;
843 	r = radeon_cs_packet_parse(p, &p3reloc, p->idx);
844 	if (r)
845 		return r;
846 	p->idx += p3reloc.count + 2;
847 	if (p3reloc.type != RADEON_PACKET_TYPE3 ||
848 	    p3reloc.opcode != RADEON_PACKET3_NOP) {
849 		DRM_ERROR("No packet3 for relocation for packet at %d.\n",
850 			  p3reloc.idx);
851 		radeon_cs_dump_packet(p, &p3reloc);
852 		return -EINVAL;
853 	}
854 	idx = radeon_get_ib_value(p, p3reloc.idx + 1);
855 	if (idx >= relocs_chunk->length_dw) {
856 		DRM_ERROR("Relocs at %d after relocations chunk end %d !\n",
857 			  idx, relocs_chunk->length_dw);
858 		radeon_cs_dump_packet(p, &p3reloc);
859 		return -EINVAL;
860 	}
861 	/* FIXME: we assume reloc size is 4 dwords */
862 	if (nomm) {
863 		*cs_reloc = p->relocs;
864 		(*cs_reloc)->gpu_offset =
865 			(u64)relocs_chunk->kdata[idx + 3] << 32;
866 		(*cs_reloc)->gpu_offset |= relocs_chunk->kdata[idx + 0];
867 	} else
868 		*cs_reloc = &p->relocs[(idx / 4)];
869 	return 0;
870 }
871