/**************************************************************************
 *
 * Copyright (c) 2006-2007 Tungsten Graphics, Inc., Cedar Park, TX., USA
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/*
 * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
 */

#include <linux/dma-buf-map.h>
#include <linux/export.h>
#include <linux/highmem.h>
#include <linux/mem_encrypt.h>
#ifdef __linux__
#include <xen/xen.h>
#endif

#include <drm/drm_cache.h>

/* A small bounce buffer that fits on the stack. */
#define MEMCPY_BOUNCE_SIZE 128

#if defined(CONFIG_X86)
#include <asm/smp.h>

/*
 * clflushopt is an unordered instruction which needs fencing with mfence or
 * sfence to avoid ordering issues. For drm_clflush_page() this fencing happens
 * in the caller.
 */
static void
drm_clflush_page(struct vm_page *page)
{
	uint8_t *page_virtual;
	unsigned int i;
	const int size = curcpu()->ci_cflushsz;

	if (unlikely(page == NULL))
		return;

	page_virtual = kmap_atomic(page);
	for (i = 0; i < PAGE_SIZE; i += size)
		clflushopt(page_virtual + i);
	kunmap_atomic(page_virtual);
}

static void drm_cache_flush_clflush(struct vm_page *pages[],
				    unsigned long num_pages)
{
	unsigned long i;

	mb(); /* Full memory barrier before, so that CLFLUSH is ordered. */
	for (i = 0; i < num_pages; i++)
		drm_clflush_page(*pages++);
	mb(); /* And after, so that every cache line is actually flushed. */
}
#endif
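/*
 * Illustrative sketch (compiled out, not part of this file): the fencing
 * contract described above means that a caller flushing a single line with
 * the x86-only clflushopt() must bracket it with full barriers itself,
 * roughly like this.  The function name is made up for the example.
 */
#if 0
static void example_flush_one_line(void *line)
{
	mb();			/* order clflushopt against earlier stores */
	clflushopt(line);	/* flush the cache line containing @line */
	mb();			/* wait for the flush before depending on it */
}
#endif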
/**
 * drm_clflush_pages - Flush dcache lines of a set of pages.
 * @pages: List of pages to be flushed.
 * @num_pages: Number of pages in the array.
 *
 * Flush every data cache line entry that points to an address belonging
 * to a page in the array.
 */
void
drm_clflush_pages(struct vm_page *pages[], unsigned long num_pages)
{

#if defined(CONFIG_X86)
	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
		drm_cache_flush_clflush(pages, num_pages);
		return;
	}

	if (wbinvd_on_all_cpus())
		pr_err("Timed out waiting for cache flush\n");

#elif defined(__powerpc__) && defined(__linux__)
	unsigned long i;

	for (i = 0; i < num_pages; i++) {
		struct vm_page *page = pages[i];
		void *page_virtual;

		if (unlikely(page == NULL))
			continue;

		page_virtual = kmap_atomic(page);
		flush_dcache_range((unsigned long)page_virtual,
				   (unsigned long)page_virtual + PAGE_SIZE);
		kunmap_atomic(page_virtual);
	}
#else
	pr_err("Architecture has no drm_cache.c support\n");
	WARN_ON_ONCE(1);
#endif
}
EXPORT_SYMBOL(drm_clflush_pages);

/**
 * drm_clflush_sg - Flush dcache lines pointing to a scatter-gather list.
 * @st: struct sg_table.
 *
 * Flush every data cache line entry that points to an address in the
 * sg.
 */
void
drm_clflush_sg(struct sg_table *st)
{
#if defined(CONFIG_X86)
	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
		struct sg_page_iter sg_iter;

		mb(); /* CLFLUSH is ordered only by using memory barriers. */
		for_each_sgtable_page(st, &sg_iter, 0)
			drm_clflush_page(sg_page_iter_page(&sg_iter));
		mb(); /* Make sure all cache line entries are flushed. */

		return;
	}

	if (wbinvd_on_all_cpus())
		pr_err("Timed out waiting for cache flush\n");
#else
	pr_err("Architecture has no drm_cache.c support\n");
	WARN_ON_ONCE(1);
#endif
}
EXPORT_SYMBOL(drm_clflush_sg);

/**
 * drm_clflush_virt_range - Flush dcache lines of a region
 * @addr: Initial kernel memory address.
 * @length: Region size.
 *
 * Flush every data cache line entry that points to an address in the
 * region requested.
 */
void
drm_clflush_virt_range(void *addr, unsigned long length)
{
#if defined(CONFIG_X86)
	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
		const int size = curcpu()->ci_cflushsz;
		void *end = addr + length;

		/* Round down to the start of the first cache line. */
		addr = (void *)(((unsigned long)addr) & -size);
		mb(); /* CLFLUSH is only ordered with a full memory barrier. */
		for (; addr < end; addr += size)
			clflushopt(addr);
		clflushopt(end - 1); /* force serialisation */
		mb(); /* Ensure that every data cache line entry is flushed. */
		return;
	}

	if (wbinvd_on_all_cpus())
		pr_err("Timed out waiting for cache flush\n");
#else
	pr_err("Architecture has no drm_cache.c support\n");
	WARN_ON_ONCE(1);
#endif
}
EXPORT_SYMBOL(drm_clflush_virt_range);
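/*
 * Illustrative sketch (compiled out, hypothetical caller): a driver that has
 * written to a buffer through a cached kernel mapping and is about to hand it
 * to a device that does not snoop CPU caches could flush just the dirty
 * range with the helper above.  The names below are made up for the example.
 */
#if 0
static void example_flush_dirty_range(void *vaddr, unsigned long dirty_bytes)
{
	/* Flush only the bytes the CPU actually wrote. */
	drm_clflush_virt_range(vaddr, dirty_bytes);
}
#endif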
bool drm_need_swiotlb(int dma_bits)
{
	return false;
#ifdef notyet
	struct resource *tmp;
	resource_size_t max_iomem = 0;

	/*
	 * Xen paravirtual hosts require swiotlb regardless of requested dma
	 * transfer size.
	 *
	 * NOTE: Really, what it requires is use of the dma_alloc_coherent
	 * allocator used in ttm_dma_populate() instead of
	 * ttm_populate_and_map_pages(), which bounce buffers so much in
	 * Xen it leads to swiotlb buffer exhaustion.
	 */
	if (xen_pv_domain())
		return true;

	/*
	 * Enforce dma_alloc_coherent when memory encryption is active as well
	 * for the same reasons as for Xen paravirtual hosts.
	 */
	if (mem_encrypt_active())
		return true;

	for (tmp = iomem_resource.child; tmp; tmp = tmp->sibling)
		max_iomem = max(max_iomem, tmp->end);

	return max_iomem > ((u64)1 << dma_bits);
#endif
}
EXPORT_SYMBOL(drm_need_swiotlb);

static void memcpy_fallback(struct dma_buf_map *dst,
			    const struct dma_buf_map *src,
			    unsigned long len)
{
	if (!dst->is_iomem && !src->is_iomem) {
		memcpy(dst->vaddr, src->vaddr, len);
	} else if (!src->is_iomem) {
		dma_buf_map_memcpy_to(dst, src->vaddr, len);
	} else if (!dst->is_iomem) {
		memcpy_fromio(dst->vaddr, src->vaddr_iomem, len);
	} else {
		/*
		 * Bounce size is not performance tuned, but using a
		 * bounce buffer like this is significantly faster than
		 * resorting to ioreadxx() + iowritexx().
		 */
		char bounce[MEMCPY_BOUNCE_SIZE];
		void __iomem *_src = src->vaddr_iomem;
		void __iomem *_dst = dst->vaddr_iomem;

		while (len >= MEMCPY_BOUNCE_SIZE) {
			memcpy_fromio(bounce, _src, MEMCPY_BOUNCE_SIZE);
			memcpy_toio(_dst, bounce, MEMCPY_BOUNCE_SIZE);
			_src += MEMCPY_BOUNCE_SIZE;
			_dst += MEMCPY_BOUNCE_SIZE;
			len -= MEMCPY_BOUNCE_SIZE;
		}
		if (len) {
			/* Copy only the remaining tail, not a full bounce chunk. */
			memcpy_fromio(bounce, _src, len);
			memcpy_toio(_dst, bounce, len);
		}
	}
}

#ifdef CONFIG_X86

#ifdef __linux__
static DEFINE_STATIC_KEY_FALSE(has_movntdqa);
#else
static int has_movntdqa;

#include <asm/fpu/api.h>

static inline void
static_branch_enable(int *x)
{
	*x = 1;
}

static inline int
static_branch_likely(int *x)
{
	return (likely(*x == 1));
}

#endif

/*
 * Copy @len 16-byte blocks from @src to @dst using non-temporal (streaming)
 * loads.  Both pointers must be 16-byte aligned.
 */
static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
{
	kernel_fpu_begin();

	/* Copy four 16-byte blocks (64 bytes) per iteration. */
	while (len >= 4) {
		asm("movntdqa (%0), %%xmm0\n"
		    "movntdqa 16(%0), %%xmm1\n"
		    "movntdqa 32(%0), %%xmm2\n"
		    "movntdqa 48(%0), %%xmm3\n"
		    "movaps %%xmm0, (%1)\n"
		    "movaps %%xmm1, 16(%1)\n"
		    "movaps %%xmm2, 32(%1)\n"
		    "movaps %%xmm3, 48(%1)\n"
		    :: "r" (src), "r" (dst) : "memory");
		src += 64;
		dst += 64;
		len -= 4;
	}
	/* Copy any remaining 16-byte blocks one at a time. */
	while (len--) {
		asm("movntdqa (%0), %%xmm0\n"
		    "movaps %%xmm0, (%1)\n"
		    :: "r" (src), "r" (dst) : "memory");
		src += 16;
		dst += 16;
	}

	kernel_fpu_end();
}

/*
 * __drm_memcpy_from_wc copies @len bytes from @src to @dst using
 * non-temporal instructions where available. Note that all arguments
 * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
 * of 16.
 */
static void __drm_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
		memcpy(dst, src, len);
	else if (likely(len))
		__memcpy_ntdqa(dst, src, len >> 4);
}
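/*
 * Worked example of the dispatch above (compiled out, hypothetical values):
 * with both pointers 16-byte aligned and len = 4096, the alignment test
 * passes and __memcpy_ntdqa() copies 4096 >> 4 = 256 blocks of 16 bytes,
 * four blocks per iteration of its unrolled loop.  An unaligned pointer or
 * an odd length silently degrades to a plain cached memcpy().
 */
#if 0
static void example_wc_read(void *dst_aligned16, const void *src_aligned16)
{
	/* Takes the movntdqa path: 4096 is a multiple of 16. */
	__drm_memcpy_from_wc(dst_aligned16, src_aligned16, 4096);

	/* Falls back to memcpy(): 4095 is not a multiple of 16. */
	__drm_memcpy_from_wc(dst_aligned16, src_aligned16, 4095);
}
#endif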
/**
 * drm_memcpy_from_wc - Perform the fastest available memcpy from a source
 * that may be WC.
 * @dst: The destination pointer
 * @src: The source pointer
 * @len: The size of the area to transfer in bytes
 *
 * Tries an arch-optimized memcpy that prefetches when reading out of a WC
 * region, and if no such beast is available, falls back to a normal memcpy.
 */
void drm_memcpy_from_wc(struct dma_buf_map *dst,
			const struct dma_buf_map *src,
			unsigned long len)
{
	if (WARN_ON(in_interrupt())) {
		memcpy_fallback(dst, src, len);
		return;
	}

	if (static_branch_likely(&has_movntdqa)) {
		__drm_memcpy_from_wc(dst->is_iomem ?
				     (void __force *)dst->vaddr_iomem :
				     dst->vaddr,
				     src->is_iomem ?
				     (void const __force *)src->vaddr_iomem :
				     src->vaddr,
				     len);
		return;
	}

	memcpy_fallback(dst, src, len);
}
EXPORT_SYMBOL(drm_memcpy_from_wc);

/*
 * drm_memcpy_init_early - One time initialization of the WC memcpy code
 */
void drm_memcpy_init_early(void)
{
	/*
	 * Some hypervisors (e.g. KVM) don't support emulation of VEX-prefixed
	 * instructions, so don't enable movntdqa in a hypervisor guest.
	 */
	if (static_cpu_has(X86_FEATURE_XMM4_1) &&
	    !boot_cpu_has(X86_FEATURE_HYPERVISOR))
		static_branch_enable(&has_movntdqa);
}
#else
void drm_memcpy_from_wc(struct dma_buf_map *dst,
			const struct dma_buf_map *src,
			unsigned long len)
{
	WARN_ON(in_interrupt());

	memcpy_fallback(dst, src, len);
}
EXPORT_SYMBOL(drm_memcpy_from_wc);

void drm_memcpy_init_early(void)
{
}
#endif /* CONFIG_X86 */
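/*
 * Illustrative sketch (compiled out, hypothetical helper): copying a buffer
 * object that is mapped write-combined in I/O memory into a cached
 * system-memory shadow copy.  It assumes the dma_buf_map_set_vaddr*()
 * helpers from <linux/dma-buf-map.h> are available in this port's compat
 * headers; the surrounding names are made up for the example.
 */
#if 0
static void example_copy_wc_bo_to_shadow(void __iomem *bo_mmio, void *shadow,
					 unsigned long size)
{
	struct dma_buf_map src, dst;

	dma_buf_map_set_vaddr_iomem(&src, bo_mmio);	/* WC source in iomem */
	dma_buf_map_set_vaddr(&dst, shadow);		/* cached destination */

	/* Uses movntdqa when available, memcpy_fallback() otherwise. */
	drm_memcpy_from_wc(&dst, &src, size);
}
#endif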