linux-2.6-omap-h63xx.git: drivers/pci/intel-iommu.c
(blob at commit "iommu bitmap instead of iommu pointer in dmar_domain")
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
59
60 /* global iommu list, set NULL for ignored DMAR units */
61 static struct intel_iommu **g_iommus;
62
63 /*
64  * 0: Present
65  * 1-11: Reserved
66  * 12-63: Context Ptr (12 - (haw-1))
67  * 64-127: Reserved
68  */
69 struct root_entry {
70         u64     val;
71         u64     rsvd1;
72 };
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
74 static inline bool root_present(struct root_entry *root)
75 {
76         return (root->val & 1);
77 }
78 static inline void set_root_present(struct root_entry *root)
79 {
80         root->val |= 1;
81 }
82 static inline void set_root_value(struct root_entry *root, unsigned long value)
83 {
84         root->val |= value & VTD_PAGE_MASK;
85 }
86
87 static inline struct context_entry *
88 get_context_addr_from_root(struct root_entry *root)
89 {
90         return (struct context_entry *)
91                 (root_present(root)?phys_to_virt(
92                 root->val & VTD_PAGE_MASK) :
93                 NULL);
94 }
95
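/*
 * Editorial note (not in the original file): the root table above holds
 * ROOT_ENTRY_NR (4096 / 16 = 256) entries, one per PCI bus.  Each present
 * root entry points to a page-sized context table, which
 * device_to_context_entry() below indexes by devfn, so every bus/devfn
 * pair ends up with its own context entry.
 */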
96 /*
97  * low 64 bits:
98  * 0: present
99  * 1: fault processing disable
100  * 2-3: translation type
101  * 12-63: address space root
102  * high 64 bits:
103  * 0-2: address width
104  * 3-6: avail
105  * 8-23: domain id
106  */
107 struct context_entry {
108         u64 lo;
109         u64 hi;
110 };
111
112 static inline bool context_present(struct context_entry *context)
113 {
114         return (context->lo & 1);
115 }
116 static inline void context_set_present(struct context_entry *context)
117 {
118         context->lo |= 1;
119 }
120
121 static inline void context_set_fault_enable(struct context_entry *context)
122 {
123         context->lo &= (((u64)-1) << 2) | 1;
124 }
125
126 #define CONTEXT_TT_MULTI_LEVEL 0
127
128 static inline void context_set_translation_type(struct context_entry *context,
129                                                 unsigned long value)
130 {
131         context->lo &= (((u64)-1) << 4) | 3;
132         context->lo |= (value & 3) << 2;
133 }
134
135 static inline void context_set_address_root(struct context_entry *context,
136                                             unsigned long value)
137 {
138         context->lo |= value & VTD_PAGE_MASK;
139 }
140
141 static inline void context_set_address_width(struct context_entry *context,
142                                              unsigned long value)
143 {
144         context->hi |= value & 7;
145 }
146
147 static inline void context_set_domain_id(struct context_entry *context,
148                                          unsigned long value)
149 {
150         context->hi |= (value & ((1 << 16) - 1)) << 8;
151 }
152
153 static inline void context_clear_entry(struct context_entry *context)
154 {
155         context->lo = 0;
156         context->hi = 0;
157 }
158
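#if 0
/*
 * Illustrative sketch only, not part of the original file: how the bit
 * helpers above compose into a complete context entry.  It mirrors the
 * steps domain_context_mapping_one() performs further down; "ce",
 * "pgd_phys", "did" and "agaw" are hypothetical names.
 */
static void example_build_context_entry(struct context_entry *ce,
					u64 pgd_phys, u16 did, int agaw)
{
	context_clear_entry(ce);			/* lo = hi = 0 */
	context_set_domain_id(ce, did);			/* hi bits 8-23 */
	context_set_address_width(ce, agaw);		/* hi bits 0-2 */
	context_set_address_root(ce, pgd_phys);		/* lo bits 12-63 */
	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(ce);			/* clear FPD (bit 1) */
	context_set_present(ce);			/* set bit 0 last */
}
#endif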
159 /*
160  * 0: readable
161  * 1: writable
162  * 2-6: reserved
163  * 7: super page
164  * 8-11: available
165  * 12-63: Host physical address
166  */
167 struct dma_pte {
168         u64 val;
169 };
170
171 static inline void dma_clear_pte(struct dma_pte *pte)
172 {
173         pte->val = 0;
174 }
175
176 static inline void dma_set_pte_readable(struct dma_pte *pte)
177 {
178         pte->val |= DMA_PTE_READ;
179 }
180
181 static inline void dma_set_pte_writable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_WRITE;
184 }
185
186 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
187 {
188         pte->val = (pte->val & ~3) | (prot & 3);
189 }
190
191 static inline u64 dma_pte_addr(struct dma_pte *pte)
192 {
193         return (pte->val & VTD_PAGE_MASK);
194 }
195
196 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
197 {
198         pte->val |= (addr & VTD_PAGE_MASK);
199 }
200
201 static inline bool dma_pte_present(struct dma_pte *pte)
202 {
203         return (pte->val & 3) != 0;
204 }
205
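#if 0
/*
 * Illustrative sketch only, not part of the original file: filling one
 * last-level PTE using the helpers above, as domain_page_mapping() does
 * below.  "pte" and "host_phys" are hypothetical names.
 */
static void example_fill_pte(struct dma_pte *pte, u64 host_phys, int prot)
{
	dma_clear_pte(pte);				/* val = 0 */
	dma_set_pte_addr(pte, host_phys);		/* bits 12-63 */
	dma_set_pte_prot(pte, prot);			/* DMA_PTE_READ/WRITE */
}
#endif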
206 /* devices under the same p2p bridge are owned in one domain */
207 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
208
209 struct dmar_domain {
210         int     id;                     /* domain id */
211         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
212
213         struct list_head devices;       /* all devices' list */
214         struct iova_domain iovad;       /* iova's that belong to this domain */
215
216         struct dma_pte  *pgd;           /* virtual address */
217         spinlock_t      mapping_lock;   /* page table lock */
218         int             gaw;            /* max guest address width */
219
220         /* adjusted guest address width, 0 is level 2 30-bit */
221         int             agaw;
222
223         int             flags;          /* flags to find out type of domain */
224 };
225
226 /* PCI domain-device relationship */
227 struct device_domain_info {
228         struct list_head link;  /* link to domain siblings */
229         struct list_head global; /* link to global list */
230         u8 bus;                 /* PCI bus number */
231         u8 devfn;               /* PCI devfn number */
232         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
233         struct dmar_domain *domain; /* pointer to domain */
234 };
235
236 static void flush_unmaps_timeout(unsigned long data);
237
238 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
239
240 #define HIGH_WATER_MARK 250
241 struct deferred_flush_tables {
242         int next;
243         struct iova *iova[HIGH_WATER_MARK];
244         struct dmar_domain *domain[HIGH_WATER_MARK];
245 };
246
247 static struct deferred_flush_tables *deferred_flush;
248
249 /* number of IOMMUs in the system; sizes g_iommus and the domain iommu bitmaps */
250 static int g_num_of_iommus;
251
252 static DEFINE_SPINLOCK(async_umap_flush_lock);
253 static LIST_HEAD(unmaps_to_do);
254
255 static int timer_on;
256 static long list_size;
257
258 static void domain_remove_dev_info(struct dmar_domain *domain);
259
260 int dmar_disabled;
261 static int __initdata dmar_map_gfx = 1;
262 static int dmar_forcedac;
263 static int intel_iommu_strict;
264
265 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
266 static DEFINE_SPINLOCK(device_domain_lock);
267 static LIST_HEAD(device_domain_list);
268
269 static int __init intel_iommu_setup(char *str)
270 {
271         if (!str)
272                 return -EINVAL;
273         while (*str) {
274                 if (!strncmp(str, "off", 3)) {
275                         dmar_disabled = 1;
276                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
277                 } else if (!strncmp(str, "igfx_off", 8)) {
278                         dmar_map_gfx = 0;
279                         printk(KERN_INFO
280                                 "Intel-IOMMU: disable GFX device mapping\n");
281                 } else if (!strncmp(str, "forcedac", 8)) {
282                         printk(KERN_INFO
283                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
284                         dmar_forcedac = 1;
285                 } else if (!strncmp(str, "strict", 6)) {
286                         printk(KERN_INFO
287                                 "Intel-IOMMU: disable batched IOTLB flush\n");
288                         intel_iommu_strict = 1;
289                 }
290
291                 str += strcspn(str, ",");
292                 while (*str == ',')
293                         str++;
294         }
295         return 0;
296 }
297 __setup("intel_iommu=", intel_iommu_setup);
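/*
 * Illustrative usage, not part of the original file: the options parsed
 * above are given on the kernel command line and may be combined with
 * commas, e.g.
 *
 *	intel_iommu=off			disable the IOMMU driver
 *	intel_iommu=igfx_off		skip mapping the graphics device
 *	intel_iommu=forcedac,strict	force DAC addressing for PCI devices
 *					and flush the IOTLB synchronously on
 *					every unmap
 */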
298
299 static struct kmem_cache *iommu_domain_cache;
300 static struct kmem_cache *iommu_devinfo_cache;
301 static struct kmem_cache *iommu_iova_cache;
302
303 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
304 {
305         unsigned int flags;
306         void *vaddr;
307
308         /* trying to avoid low memory issues */
309         flags = current->flags & PF_MEMALLOC;
310         current->flags |= PF_MEMALLOC;
311         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
312         current->flags &= (~PF_MEMALLOC | flags);
313         return vaddr;
314 }
315
316
317 static inline void *alloc_pgtable_page(void)
318 {
319         unsigned int flags;
320         void *vaddr;
321
322         /* trying to avoid low memory issues */
323         flags = current->flags & PF_MEMALLOC;
324         current->flags |= PF_MEMALLOC;
325         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
326         current->flags &= (~PF_MEMALLOC | flags);
327         return vaddr;
328 }
329
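/*
 * Editorial note (not in the original file): the PF_MEMALLOC handling in
 * the two allocators above saves the caller's PF_MEMALLOC bit, sets it so
 * the GFP_ATOMIC allocation may dip into the emergency reserves, and then
 * restores the saved state: since "flags" is either 0 or PF_MEMALLOC,
 * "current->flags &= (~PF_MEMALLOC | flags)" clears the bit only when it
 * was clear on entry.
 */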
330 static inline void free_pgtable_page(void *vaddr)
331 {
332         free_page((unsigned long)vaddr);
333 }
334
335 static inline void *alloc_domain_mem(void)
336 {
337         return iommu_kmem_cache_alloc(iommu_domain_cache);
338 }
339
340 static void free_domain_mem(void *vaddr)
341 {
342         kmem_cache_free(iommu_domain_cache, vaddr);
343 }
344
345 static inline void * alloc_devinfo_mem(void)
346 {
347         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
348 }
349
350 static inline void free_devinfo_mem(void *vaddr)
351 {
352         kmem_cache_free(iommu_devinfo_cache, vaddr);
353 }
354
355 struct iova *alloc_iova_mem(void)
356 {
357         return iommu_kmem_cache_alloc(iommu_iova_cache);
358 }
359
360 void free_iova_mem(struct iova *iova)
361 {
362         kmem_cache_free(iommu_iova_cache, iova);
363 }
364
365 /* in native case, each domain is related to only one iommu */
366 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
367 {
368         int iommu_id;
369
370         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
371         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
372                 return NULL;
373
374         return g_iommus[iommu_id];
375 }
376
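/*
 * Editorial note (not in the original file): domain->iommu_bmp is a
 * one-word bitmap indexed by iommu->seq_id.  iommu_alloc_domain() below
 * sets the owning unit's bit with set_bit(iommu->seq_id,
 * &domain->iommu_bmp), and domain_get_iommu() above simply returns the
 * IOMMU for the first (and, in the native case, only) bit that is set.
 */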
377 /* Gets context entry for a given bus and devfn */
378 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
379                 u8 bus, u8 devfn)
380 {
381         struct root_entry *root;
382         struct context_entry *context;
383         unsigned long phy_addr;
384         unsigned long flags;
385
386         spin_lock_irqsave(&iommu->lock, flags);
387         root = &iommu->root_entry[bus];
388         context = get_context_addr_from_root(root);
389         if (!context) {
390                 context = (struct context_entry *)alloc_pgtable_page();
391                 if (!context) {
392                         spin_unlock_irqrestore(&iommu->lock, flags);
393                         return NULL;
394                 }
395                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
396                 phy_addr = virt_to_phys((void *)context);
397                 set_root_value(root, phy_addr);
398                 set_root_present(root);
399                 __iommu_flush_cache(iommu, root, sizeof(*root));
400         }
401         spin_unlock_irqrestore(&iommu->lock, flags);
402         return &context[devfn];
403 }
404
405 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
406 {
407         struct root_entry *root;
408         struct context_entry *context;
409         int ret;
410         unsigned long flags;
411
412         spin_lock_irqsave(&iommu->lock, flags);
413         root = &iommu->root_entry[bus];
414         context = get_context_addr_from_root(root);
415         if (!context) {
416                 ret = 0;
417                 goto out;
418         }
419         ret = context_present(&context[devfn]);
420 out:
421         spin_unlock_irqrestore(&iommu->lock, flags);
422         return ret;
423 }
424
425 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
426 {
427         struct root_entry *root;
428         struct context_entry *context;
429         unsigned long flags;
430
431         spin_lock_irqsave(&iommu->lock, flags);
432         root = &iommu->root_entry[bus];
433         context = get_context_addr_from_root(root);
434         if (context) {
435                 context_clear_entry(&context[devfn]);
436                 __iommu_flush_cache(iommu, &context[devfn], \
437                         sizeof(*context));
438         }
439         spin_unlock_irqrestore(&iommu->lock, flags);
440 }
441
442 static void free_context_table(struct intel_iommu *iommu)
443 {
444         struct root_entry *root;
445         int i;
446         unsigned long flags;
447         struct context_entry *context;
448
449         spin_lock_irqsave(&iommu->lock, flags);
450         if (!iommu->root_entry) {
451                 goto out;
452         }
453         for (i = 0; i < ROOT_ENTRY_NR; i++) {
454                 root = &iommu->root_entry[i];
455                 context = get_context_addr_from_root(root);
456                 if (context)
457                         free_pgtable_page(context);
458         }
459         free_pgtable_page(iommu->root_entry);
460         iommu->root_entry = NULL;
461 out:
462         spin_unlock_irqrestore(&iommu->lock, flags);
463 }
464
465 /* page table handling */
466 #define LEVEL_STRIDE            (9)
467 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
468
469 static inline int agaw_to_level(int agaw)
470 {
471         return agaw + 2;
472 }
473
474 static inline int agaw_to_width(int agaw)
475 {
476         return 30 + agaw * LEVEL_STRIDE;
477
478 }
479
480 static inline int width_to_agaw(int width)
481 {
482         return (width - 30) / LEVEL_STRIDE;
483 }
484
485 static inline unsigned int level_to_offset_bits(int level)
486 {
487         return (12 + (level - 1) * LEVEL_STRIDE);
488 }
489
490 static inline int address_level_offset(u64 addr, int level)
491 {
492         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
493 }
494
495 static inline u64 level_mask(int level)
496 {
497         return ((u64)-1 << level_to_offset_bits(level));
498 }
499
500 static inline u64 level_size(int level)
501 {
502         return ((u64)1 << level_to_offset_bits(level));
503 }
504
505 static inline u64 align_to_level(u64 addr, int level)
506 {
507         return ((addr + level_size(level) - 1) & level_mask(level));
508 }
509
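/*
 * Worked example (editorial, not in the original file): with the default
 * 48-bit address width, width_to_agaw(48) = 2, agaw_to_level(2) = 4
 * page-table levels and agaw_to_width(2) = 48.  level_to_offset_bits()
 * then yields 12, 21, 30 and 39 for levels 1-4, i.e. each level indexes
 * LEVEL_STRIDE (9) address bits above the 4KB page offset.
 */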
510 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
511 {
512         int addr_width = agaw_to_width(domain->agaw);
513         struct dma_pte *parent, *pte = NULL;
514         int level = agaw_to_level(domain->agaw);
515         int offset;
516         unsigned long flags;
517         struct intel_iommu *iommu = domain_get_iommu(domain);
518
519         BUG_ON(!domain->pgd);
520
521         addr &= (((u64)1) << addr_width) - 1;
522         parent = domain->pgd;
523
524         spin_lock_irqsave(&domain->mapping_lock, flags);
525         while (level > 0) {
526                 void *tmp_page;
527
528                 offset = address_level_offset(addr, level);
529                 pte = &parent[offset];
530                 if (level == 1)
531                         break;
532
533                 if (!dma_pte_present(pte)) {
534                         tmp_page = alloc_pgtable_page();
535
536                         if (!tmp_page) {
537                                 spin_unlock_irqrestore(&domain->mapping_lock,
538                                         flags);
539                                 return NULL;
540                         }
541                         __iommu_flush_cache(iommu, tmp_page,
542                                         PAGE_SIZE);
543                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
544                         /*
545                          * higher level tables always set r/w; the last
546                          * level page table controls read/write
547                          */
548                         dma_set_pte_readable(pte);
549                         dma_set_pte_writable(pte);
550                         __iommu_flush_cache(iommu, pte, sizeof(*pte));
551                 }
552                 parent = phys_to_virt(dma_pte_addr(pte));
553                 level--;
554         }
555
556         spin_unlock_irqrestore(&domain->mapping_lock, flags);
557         return pte;
558 }
559
560 /* return address's pte at specific level */
561 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
562                 int level)
563 {
564         struct dma_pte *parent, *pte = NULL;
565         int total = agaw_to_level(domain->agaw);
566         int offset;
567
568         parent = domain->pgd;
569         while (level <= total) {
570                 offset = address_level_offset(addr, total);
571                 pte = &parent[offset];
572                 if (level == total)
573                         return pte;
574
575                 if (!dma_pte_present(pte))
576                         break;
577                 parent = phys_to_virt(dma_pte_addr(pte));
578                 total--;
579         }
580         return NULL;
581 }
582
583 /* clear one page's page table */
584 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
585 {
586         struct dma_pte *pte = NULL;
587         struct intel_iommu *iommu = domain_get_iommu(domain);
588
589         /* get last level pte */
590         pte = dma_addr_level_pte(domain, addr, 1);
591
592         if (pte) {
593                 dma_clear_pte(pte);
594                 __iommu_flush_cache(iommu, pte, sizeof(*pte));
595         }
596 }
597
598 /* clear last level pte, a tlb flush should follow */
599 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
600 {
601         int addr_width = agaw_to_width(domain->agaw);
602
603         start &= (((u64)1) << addr_width) - 1;
604         end &= (((u64)1) << addr_width) - 1;
605         /* in case it's partial page */
606         start = PAGE_ALIGN(start);
607         end &= PAGE_MASK;
608
609         /* we don't need lock here, nobody else touches the iova range */
610         while (start < end) {
611                 dma_pte_clear_one(domain, start);
612                 start += VTD_PAGE_SIZE;
613         }
614 }
615
616 /* free page table pages. last level pte should already be cleared */
617 static void dma_pte_free_pagetable(struct dmar_domain *domain,
618         u64 start, u64 end)
619 {
620         int addr_width = agaw_to_width(domain->agaw);
621         struct dma_pte *pte;
622         int total = agaw_to_level(domain->agaw);
623         int level;
624         u64 tmp;
625         struct intel_iommu *iommu = domain_get_iommu(domain);
626
627         start &= (((u64)1) << addr_width) - 1;
628         end &= (((u64)1) << addr_width) - 1;
629
630         /* we don't need lock here, nobody else touches the iova range */
631         level = 2;
632         while (level <= total) {
633                 tmp = align_to_level(start, level);
634                 if (tmp >= end || (tmp + level_size(level) > end))
635                         return;
636
637                 while (tmp < end) {
638                         pte = dma_addr_level_pte(domain, tmp, level);
639                         if (pte) {
640                                 free_pgtable_page(
641                                         phys_to_virt(dma_pte_addr(pte)));
642                                 dma_clear_pte(pte);
643                                 __iommu_flush_cache(iommu,
644                                                 pte, sizeof(*pte));
645                         }
646                         tmp += level_size(level);
647                 }
648                 level++;
649         }
650         /* free pgd */
651         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
652                 free_pgtable_page(domain->pgd);
653                 domain->pgd = NULL;
654         }
655 }
656
657 /* iommu handling */
658 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
659 {
660         struct root_entry *root;
661         unsigned long flags;
662
663         root = (struct root_entry *)alloc_pgtable_page();
664         if (!root)
665                 return -ENOMEM;
666
667         __iommu_flush_cache(iommu, root, ROOT_SIZE);
668
669         spin_lock_irqsave(&iommu->lock, flags);
670         iommu->root_entry = root;
671         spin_unlock_irqrestore(&iommu->lock, flags);
672
673         return 0;
674 }
675
676 static void iommu_set_root_entry(struct intel_iommu *iommu)
677 {
678         void *addr;
679         u32 cmd, sts;
680         unsigned long flag;
681
682         addr = iommu->root_entry;
683
684         spin_lock_irqsave(&iommu->register_lock, flag);
685         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
686
687         cmd = iommu->gcmd | DMA_GCMD_SRTP;
688         writel(cmd, iommu->reg + DMAR_GCMD_REG);
689
690         /* Make sure hardware complete it */
691         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
692                 readl, (sts & DMA_GSTS_RTPS), sts);
693
694         spin_unlock_irqrestore(&iommu->register_lock, flag);
695 }
696
697 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
698 {
699         u32 val;
700         unsigned long flag;
701
702         if (!cap_rwbf(iommu->cap))
703                 return;
704         val = iommu->gcmd | DMA_GCMD_WBF;
705
706         spin_lock_irqsave(&iommu->register_lock, flag);
707         writel(val, iommu->reg + DMAR_GCMD_REG);
708
709         /* Make sure hardware complete it */
710         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
711                         readl, (!(val & DMA_GSTS_WBFS)), val);
712
713         spin_unlock_irqrestore(&iommu->register_lock, flag);
714 }
715
716 /* return value determines if we need a write buffer flush */
717 static int __iommu_flush_context(struct intel_iommu *iommu,
718         u16 did, u16 source_id, u8 function_mask, u64 type,
719         int non_present_entry_flush)
720 {
721         u64 val = 0;
722         unsigned long flag;
723
724         /*
725          * In the non-present entry flush case, if the hardware doesn't
726          * cache non-present entries we do nothing; if it does cache them,
727          * we flush the entries of domain 0 (the domain id used to cache
728          * any non-present entries).
729          */
730         if (non_present_entry_flush) {
731                 if (!cap_caching_mode(iommu->cap))
732                         return 1;
733                 else
734                         did = 0;
735         }
736
737         switch (type) {
738         case DMA_CCMD_GLOBAL_INVL:
739                 val = DMA_CCMD_GLOBAL_INVL;
740                 break;
741         case DMA_CCMD_DOMAIN_INVL:
742                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
743                 break;
744         case DMA_CCMD_DEVICE_INVL:
745                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
746                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
747                 break;
748         default:
749                 BUG();
750         }
751         val |= DMA_CCMD_ICC;
752
753         spin_lock_irqsave(&iommu->register_lock, flag);
754         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
755
756         /* Make sure hardware complete it */
757         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
758                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
759
760         spin_unlock_irqrestore(&iommu->register_lock, flag);
761
762         /* flush context entry will implicitly flush write buffer */
763         return 0;
764 }
765
766 /* return value determines if we need a write buffer flush */
767 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
768         u64 addr, unsigned int size_order, u64 type,
769         int non_present_entry_flush)
770 {
771         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
772         u64 val = 0, val_iva = 0;
773         unsigned long flag;
774
775         /*
776          * In the non-present entry flush case, if the hardware doesn't
777          * cache non-present entries we do nothing; if it does cache them,
778          * we flush the entries of domain 0 (the domain id used to cache
779          * any non-present entries).
780          */
781         if (non_present_entry_flush) {
782                 if (!cap_caching_mode(iommu->cap))
783                         return 1;
784                 else
785                         did = 0;
786         }
787
788         switch (type) {
789         case DMA_TLB_GLOBAL_FLUSH:
790                 /* global flush doesn't need to set IVA_REG */
791                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
792                 break;
793         case DMA_TLB_DSI_FLUSH:
794                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
795                 break;
796         case DMA_TLB_PSI_FLUSH:
797                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
798                 /* Note: always flush non-leaf currently */
799                 val_iva = size_order | addr;
800                 break;
801         default:
802                 BUG();
803         }
804         /* Note: set drain read/write */
805 #if 0
806         /*
807          * This is probably to be super secure.. Looks like we can
808          * ignore it without any impact.
809          */
810         if (cap_read_drain(iommu->cap))
811                 val |= DMA_TLB_READ_DRAIN;
812 #endif
813         if (cap_write_drain(iommu->cap))
814                 val |= DMA_TLB_WRITE_DRAIN;
815
816         spin_lock_irqsave(&iommu->register_lock, flag);
817         /* Note: Only uses first TLB reg currently */
818         if (val_iva)
819                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
820         dmar_writeq(iommu->reg + tlb_offset + 8, val);
821
822         /* Make sure hardware complete it */
823         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
824                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
825
826         spin_unlock_irqrestore(&iommu->register_lock, flag);
827
828         /* check IOTLB invalidation granularity */
829         if (DMA_TLB_IAIG(val) == 0)
830                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
831         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
832                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
833                         (unsigned long long)DMA_TLB_IIRG(type),
834                         (unsigned long long)DMA_TLB_IAIG(val));
835         /* flush iotlb entry will implicitly flush write buffer */
836         return 0;
837 }
838
839 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
840         u64 addr, unsigned int pages, int non_present_entry_flush)
841 {
842         unsigned int mask;
843
844         BUG_ON(addr & (~VTD_PAGE_MASK));
845         BUG_ON(pages == 0);
846
847         /* Fallback to domain selective flush if no PSI support */
848         if (!cap_pgsel_inv(iommu->cap))
849                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
850                                                 DMA_TLB_DSI_FLUSH,
851                                                 non_present_entry_flush);
852
853         /*
854          * PSI requires page size to be 2 ^ x, and the base address is naturally
855          * aligned to the size
856          */
857         mask = ilog2(__roundup_pow_of_two(pages));
858         /* Fallback to domain selective flush if size is too big */
859         if (mask > cap_max_amask_val(iommu->cap))
860                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
861                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
862
863         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
864                                         DMA_TLB_PSI_FLUSH,
865                                         non_present_entry_flush);
866 }
867
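/*
 * Worked example (editorial, not in the original file): flushing 10 pages
 * with PSI gives mask = ilog2(__roundup_pow_of_two(10)) = ilog2(16) = 4,
 * i.e. a 16-page aligned region.  If that mask exceeded
 * cap_max_amask_val(), the code above would fall back to a
 * domain-selective flush instead.
 */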
868 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
869 {
870         u32 pmen;
871         unsigned long flags;
872
873         spin_lock_irqsave(&iommu->register_lock, flags);
874         pmen = readl(iommu->reg + DMAR_PMEN_REG);
875         pmen &= ~DMA_PMEN_EPM;
876         writel(pmen, iommu->reg + DMAR_PMEN_REG);
877
878         /* wait for the protected region status bit to clear */
879         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
880                 readl, !(pmen & DMA_PMEN_PRS), pmen);
881
882         spin_unlock_irqrestore(&iommu->register_lock, flags);
883 }
884
885 static int iommu_enable_translation(struct intel_iommu *iommu)
886 {
887         u32 sts;
888         unsigned long flags;
889
890         spin_lock_irqsave(&iommu->register_lock, flags);
891         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
892
893         /* Make sure hardware complete it */
894         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
895                 readl, (sts & DMA_GSTS_TES), sts);
896
897         iommu->gcmd |= DMA_GCMD_TE;
898         spin_unlock_irqrestore(&iommu->register_lock, flags);
899         return 0;
900 }
901
902 static int iommu_disable_translation(struct intel_iommu *iommu)
903 {
904         u32 sts;
905         unsigned long flag;
906
907         spin_lock_irqsave(&iommu->register_lock, flag);
908         iommu->gcmd &= ~DMA_GCMD_TE;
909         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
910
911         /* Make sure hardware complete it */
912         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
913                 readl, (!(sts & DMA_GSTS_TES)), sts);
914
915         spin_unlock_irqrestore(&iommu->register_lock, flag);
916         return 0;
917 }
918
919 /* iommu interrupt handling. Most of it is MSI-like. */
920
921 static const char *fault_reason_strings[] =
922 {
923         "Software",
924         "Present bit in root entry is clear",
925         "Present bit in context entry is clear",
926         "Invalid context entry",
927         "Access beyond MGAW",
928         "PTE Write access is not set",
929         "PTE Read access is not set",
930         "Next page table ptr is invalid",
931         "Root table address invalid",
932         "Context table ptr is invalid",
933         "non-zero reserved fields in RTP",
934         "non-zero reserved fields in CTP",
935         "non-zero reserved fields in PTE",
936 };
937 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
938
939 const char *dmar_get_fault_reason(u8 fault_reason)
940 {
941         if (fault_reason > MAX_FAULT_REASON_IDX)
942                 return "Unknown";
943         else
944                 return fault_reason_strings[fault_reason];
945 }
946
947 void dmar_msi_unmask(unsigned int irq)
948 {
949         struct intel_iommu *iommu = get_irq_data(irq);
950         unsigned long flag;
951
952         /* unmask it */
953         spin_lock_irqsave(&iommu->register_lock, flag);
954         writel(0, iommu->reg + DMAR_FECTL_REG);
955         /* Read a reg to force flush the post write */
956         readl(iommu->reg + DMAR_FECTL_REG);
957         spin_unlock_irqrestore(&iommu->register_lock, flag);
958 }
959
960 void dmar_msi_mask(unsigned int irq)
961 {
962         unsigned long flag;
963         struct intel_iommu *iommu = get_irq_data(irq);
964
965         /* mask it */
966         spin_lock_irqsave(&iommu->register_lock, flag);
967         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
968         /* Read a reg to force flush the post write */
969         readl(iommu->reg + DMAR_FECTL_REG);
970         spin_unlock_irqrestore(&iommu->register_lock, flag);
971 }
972
973 void dmar_msi_write(int irq, struct msi_msg *msg)
974 {
975         struct intel_iommu *iommu = get_irq_data(irq);
976         unsigned long flag;
977
978         spin_lock_irqsave(&iommu->register_lock, flag);
979         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
980         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
981         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
982         spin_unlock_irqrestore(&iommu->register_lock, flag);
983 }
984
985 void dmar_msi_read(int irq, struct msi_msg *msg)
986 {
987         struct intel_iommu *iommu = get_irq_data(irq);
988         unsigned long flag;
989
990         spin_lock_irqsave(&iommu->register_lock, flag);
991         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
992         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
993         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
994         spin_unlock_irqrestore(&iommu->register_lock, flag);
995 }
996
997 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
998                 u8 fault_reason, u16 source_id, unsigned long long addr)
999 {
1000         const char *reason;
1001
1002         reason = dmar_get_fault_reason(fault_reason);
1003
1004         printk(KERN_ERR
1005                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1006                 "fault addr %llx \n"
1007                 "DMAR:[fault reason %02d] %s\n",
1008                 (type ? "DMA Read" : "DMA Write"),
1009                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1010                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1011         return 0;
1012 }
1013
1014 #define PRIMARY_FAULT_REG_LEN (16)
1015 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1016 {
1017         struct intel_iommu *iommu = dev_id;
1018         int reg, fault_index;
1019         u32 fault_status;
1020         unsigned long flag;
1021
1022         spin_lock_irqsave(&iommu->register_lock, flag);
1023         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1024
1025         /* TBD: ignore advanced fault log currently */
1026         if (!(fault_status & DMA_FSTS_PPF))
1027                 goto clear_overflow;
1028
1029         fault_index = dma_fsts_fault_record_index(fault_status);
1030         reg = cap_fault_reg_offset(iommu->cap);
1031         while (1) {
1032                 u8 fault_reason;
1033                 u16 source_id;
1034                 u64 guest_addr;
1035                 int type;
1036                 u32 data;
1037
1038                 /* highest 32 bits */
1039                 data = readl(iommu->reg + reg +
1040                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1041                 if (!(data & DMA_FRCD_F))
1042                         break;
1043
1044                 fault_reason = dma_frcd_fault_reason(data);
1045                 type = dma_frcd_type(data);
1046
1047                 data = readl(iommu->reg + reg +
1048                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1049                 source_id = dma_frcd_source_id(data);
1050
1051                 guest_addr = dmar_readq(iommu->reg + reg +
1052                                 fault_index * PRIMARY_FAULT_REG_LEN);
1053                 guest_addr = dma_frcd_page_addr(guest_addr);
1054                 /* clear the fault */
1055                 writel(DMA_FRCD_F, iommu->reg + reg +
1056                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1057
1058                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1059
1060                 iommu_page_fault_do_one(iommu, type, fault_reason,
1061                                 source_id, guest_addr);
1062
1063                 fault_index++;
1064                 if (fault_index > cap_num_fault_regs(iommu->cap))
1065                         fault_index = 0;
1066                 spin_lock_irqsave(&iommu->register_lock, flag);
1067         }
1068 clear_overflow:
1069         /* clear primary fault overflow */
1070         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1071         if (fault_status & DMA_FSTS_PFO)
1072                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1073
1074         spin_unlock_irqrestore(&iommu->register_lock, flag);
1075         return IRQ_HANDLED;
1076 }
1077
1078 int dmar_set_interrupt(struct intel_iommu *iommu)
1079 {
1080         int irq, ret;
1081
1082         irq = create_irq();
1083         if (!irq) {
1084                 printk(KERN_ERR "IOMMU: no free vectors\n");
1085                 return -EINVAL;
1086         }
1087
1088         set_irq_data(irq, iommu);
1089         iommu->irq = irq;
1090
1091         ret = arch_setup_dmar_msi(irq);
1092         if (ret) {
1093                 set_irq_data(irq, NULL);
1094                 iommu->irq = 0;
1095                 destroy_irq(irq);
1096                 return ret;
1097         }
1098
1099         /* Force fault register is cleared */
1100         iommu_page_fault(irq, iommu);
1101
1102         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1103         if (ret)
1104                 printk(KERN_ERR "IOMMU: can't request irq\n");
1105         return ret;
1106 }
1107
1108 static int iommu_init_domains(struct intel_iommu *iommu)
1109 {
1110         unsigned long ndomains;
1111         unsigned long nlongs;
1112
1113         ndomains = cap_ndoms(iommu->cap);
1114         pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1115         nlongs = BITS_TO_LONGS(ndomains);
1116
1117         /* TBD: there might be 64K domains,
1118          * consider other allocation for future chip
1119          */
1120         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1121         if (!iommu->domain_ids) {
1122                 printk(KERN_ERR "Allocating domain id array failed\n");
1123                 return -ENOMEM;
1124         }
1125         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1126                         GFP_KERNEL);
1127         if (!iommu->domains) {
1128                 printk(KERN_ERR "Allocating domain array failed\n");
1129                 kfree(iommu->domain_ids);
1130                 return -ENOMEM;
1131         }
1132
1133         spin_lock_init(&iommu->lock);
1134
1135         /*
1136          * if Caching mode is set, then invalid translations are tagged
1137          * with domainid 0. Hence we need to pre-allocate it.
1138          */
1139         if (cap_caching_mode(iommu->cap))
1140                 set_bit(0, iommu->domain_ids);
1141         return 0;
1142 }
1143
1144
1145 static void domain_exit(struct dmar_domain *domain);
1146
1147 void free_dmar_iommu(struct intel_iommu *iommu)
1148 {
1149         struct dmar_domain *domain;
1150         int i;
1151
1152         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1153         for (; i < cap_ndoms(iommu->cap); ) {
1154                 domain = iommu->domains[i];
1155                 clear_bit(i, iommu->domain_ids);
1156                 domain_exit(domain);
1157                 i = find_next_bit(iommu->domain_ids,
1158                         cap_ndoms(iommu->cap), i+1);
1159         }
1160
1161         if (iommu->gcmd & DMA_GCMD_TE)
1162                 iommu_disable_translation(iommu);
1163
1164         if (iommu->irq) {
1165                 set_irq_data(iommu->irq, NULL);
1166                 /* This will mask the irq */
1167                 free_irq(iommu->irq, iommu);
1168                 destroy_irq(iommu->irq);
1169         }
1170
1171         kfree(iommu->domains);
1172         kfree(iommu->domain_ids);
1173
1174         g_iommus[iommu->seq_id] = NULL;
1175
1176         /* if all iommus are freed, free g_iommus */
1177         for (i = 0; i < g_num_of_iommus; i++) {
1178                 if (g_iommus[i])
1179                         break;
1180         }
1181
1182         if (i == g_num_of_iommus)
1183                 kfree(g_iommus);
1184
1185         /* free context mapping */
1186         free_context_table(iommu);
1187 }
1188
1189 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1190 {
1191         unsigned long num;
1192         unsigned long ndomains;
1193         struct dmar_domain *domain;
1194         unsigned long flags;
1195
1196         domain = alloc_domain_mem();
1197         if (!domain)
1198                 return NULL;
1199
1200         ndomains = cap_ndoms(iommu->cap);
1201
1202         spin_lock_irqsave(&iommu->lock, flags);
1203         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1204         if (num >= ndomains) {
1205                 spin_unlock_irqrestore(&iommu->lock, flags);
1206                 free_domain_mem(domain);
1207                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1208                 return NULL;
1209         }
1210
1211         set_bit(num, iommu->domain_ids);
1212         domain->id = num;
1213         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1214         set_bit(iommu->seq_id, &domain->iommu_bmp);
1215         domain->flags = 0;
1216         iommu->domains[num] = domain;
1217         spin_unlock_irqrestore(&iommu->lock, flags);
1218
1219         return domain;
1220 }
1221
1222 static void iommu_free_domain(struct dmar_domain *domain)
1223 {
1224         unsigned long flags;
1225         struct intel_iommu *iommu;
1226
1227         iommu = domain_get_iommu(domain);
1228
1229         spin_lock_irqsave(&iommu->lock, flags);
1230         clear_bit(domain->id, iommu->domain_ids);
1231         spin_unlock_irqrestore(&iommu->lock, flags);
1232 }
1233
1234 static struct iova_domain reserved_iova_list;
1235 static struct lock_class_key reserved_alloc_key;
1236 static struct lock_class_key reserved_rbtree_key;
1237
1238 static void dmar_init_reserved_ranges(void)
1239 {
1240         struct pci_dev *pdev = NULL;
1241         struct iova *iova;
1242         int i;
1243         u64 addr, size;
1244
1245         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1246
1247         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1248                 &reserved_alloc_key);
1249         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1250                 &reserved_rbtree_key);
1251
1252         /* IOAPIC ranges shouldn't be accessed by DMA */
1253         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1254                 IOVA_PFN(IOAPIC_RANGE_END));
1255         if (!iova)
1256                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1257
1258         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1259         for_each_pci_dev(pdev) {
1260                 struct resource *r;
1261
1262                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1263                         r = &pdev->resource[i];
1264                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1265                                 continue;
1266                         addr = r->start;
1267                         addr &= PAGE_MASK;
1268                         size = r->end - addr;
1269                         size = PAGE_ALIGN(size);
1270                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1271                                 IOVA_PFN(size + addr) - 1);
1272                         if (!iova)
1273                                 printk(KERN_ERR "Reserve iova failed\n");
1274                 }
1275         }
1276
1277 }
1278
1279 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1280 {
1281         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1282 }
1283
1284 static inline int guestwidth_to_adjustwidth(int gaw)
1285 {
1286         int agaw;
1287         int r = (gaw - 12) % 9;
1288
1289         if (r == 0)
1290                 agaw = gaw;
1291         else
1292                 agaw = gaw + 9 - r;
1293         if (agaw > 64)
1294                 agaw = 64;
1295         return agaw;
1296 }
1297
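/*
 * Worked example (editorial, not in the original file): a 32-bit guest
 * width gives r = (32 - 12) % 9 = 2, so it is rounded up to
 * 32 + 9 - 2 = 39 bits; widths of 39 and 48 already satisfy
 * (gaw - 12) % 9 == 0 and are returned unchanged.
 */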
1298 static int domain_init(struct dmar_domain *domain, int guest_width)
1299 {
1300         struct intel_iommu *iommu;
1301         int adjust_width, agaw;
1302         unsigned long sagaw;
1303
1304         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1305         spin_lock_init(&domain->mapping_lock);
1306
1307         domain_reserve_special_ranges(domain);
1308
1309         /* calculate AGAW */
1310         iommu = domain_get_iommu(domain);
1311         if (guest_width > cap_mgaw(iommu->cap))
1312                 guest_width = cap_mgaw(iommu->cap);
1313         domain->gaw = guest_width;
1314         adjust_width = guestwidth_to_adjustwidth(guest_width);
1315         agaw = width_to_agaw(adjust_width);
1316         sagaw = cap_sagaw(iommu->cap);
1317         if (!test_bit(agaw, &sagaw)) {
1318                 /* hardware doesn't support it, choose a bigger one */
1319                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1320                 agaw = find_next_bit(&sagaw, 5, agaw);
1321                 if (agaw >= 5)
1322                         return -ENODEV;
1323         }
1324         domain->agaw = agaw;
1325         INIT_LIST_HEAD(&domain->devices);
1326
1327         /* always allocate the top pgd */
1328         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1329         if (!domain->pgd)
1330                 return -ENOMEM;
1331         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1332         return 0;
1333 }
1334
1335 static void domain_exit(struct dmar_domain *domain)
1336 {
1337         u64 end;
1338
1339         /* Domain 0 is reserved, so don't process it */
1340         if (!domain)
1341                 return;
1342
1343         domain_remove_dev_info(domain);
1344         /* destroy iovas */
1345         put_iova_domain(&domain->iovad);
1346         end = DOMAIN_MAX_ADDR(domain->gaw);
1347         end = end & (~PAGE_MASK);
1348
1349         /* clear ptes */
1350         dma_pte_clear_range(domain, 0, end);
1351
1352         /* free page tables */
1353         dma_pte_free_pagetable(domain, 0, end);
1354
1355         iommu_free_domain(domain);
1356         free_domain_mem(domain);
1357 }
1358
1359 static int domain_context_mapping_one(struct dmar_domain *domain,
1360                 u8 bus, u8 devfn)
1361 {
1362         struct context_entry *context;
1363         struct intel_iommu *iommu = domain_get_iommu(domain);
1364         unsigned long flags;
1365
1366         pr_debug("Set context mapping for %02x:%02x.%d\n",
1367                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1368         BUG_ON(!domain->pgd);
1369         context = device_to_context_entry(iommu, bus, devfn);
1370         if (!context)
1371                 return -ENOMEM;
1372         spin_lock_irqsave(&iommu->lock, flags);
1373         if (context_present(context)) {
1374                 spin_unlock_irqrestore(&iommu->lock, flags);
1375                 return 0;
1376         }
1377
1378         context_set_domain_id(context, domain->id);
1379         context_set_address_width(context, domain->agaw);
1380         context_set_address_root(context, virt_to_phys(domain->pgd));
1381         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1382         context_set_fault_enable(context);
1383         context_set_present(context);
1384         __iommu_flush_cache(iommu, context, sizeof(*context));
1385
1386         /* it's a non-present to present mapping */
1387         if (iommu->flush.flush_context(iommu, domain->id,
1388                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1389                 DMA_CCMD_DEVICE_INVL, 1))
1390                 iommu_flush_write_buffer(iommu);
1391         else
1392                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1393
1394         spin_unlock_irqrestore(&iommu->lock, flags);
1395         return 0;
1396 }
1397
1398 static int
1399 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1400 {
1401         int ret;
1402         struct pci_dev *tmp, *parent;
1403
1404         ret = domain_context_mapping_one(domain, pdev->bus->number,
1405                 pdev->devfn);
1406         if (ret)
1407                 return ret;
1408
1409         /* dependent device mapping */
1410         tmp = pci_find_upstream_pcie_bridge(pdev);
1411         if (!tmp)
1412                 return 0;
1413         /* Secondary interface's bus number and devfn 0 */
1414         parent = pdev->bus->self;
1415         while (parent != tmp) {
1416                 ret = domain_context_mapping_one(domain, parent->bus->number,
1417                         parent->devfn);
1418                 if (ret)
1419                         return ret;
1420                 parent = parent->bus->self;
1421         }
1422         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1423                 return domain_context_mapping_one(domain,
1424                         tmp->subordinate->number, 0);
1425         else /* this is a legacy PCI bridge */
1426                 return domain_context_mapping_one(domain,
1427                         tmp->bus->number, tmp->devfn);
1428 }
1429
1430 static int domain_context_mapped(struct dmar_domain *domain,
1431         struct pci_dev *pdev)
1432 {
1433         int ret;
1434         struct pci_dev *tmp, *parent;
1435         struct intel_iommu *iommu = domain_get_iommu(domain);
1436
1437         ret = device_context_mapped(iommu,
1438                 pdev->bus->number, pdev->devfn);
1439         if (!ret)
1440                 return ret;
1441         /* dependent device mapping */
1442         tmp = pci_find_upstream_pcie_bridge(pdev);
1443         if (!tmp)
1444                 return ret;
1445         /* Secondary interface's bus number and devfn 0 */
1446         parent = pdev->bus->self;
1447         while (parent != tmp) {
1448                 ret = device_context_mapped(iommu, parent->bus->number,
1449                         parent->devfn);
1450                 if (!ret)
1451                         return ret;
1452                 parent = parent->bus->self;
1453         }
1454         if (tmp->is_pcie)
1455                 return device_context_mapped(iommu,
1456                         tmp->subordinate->number, 0);
1457         else
1458                 return device_context_mapped(iommu,
1459                         tmp->bus->number, tmp->devfn);
1460 }
1461
1462 static int
1463 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1464                         u64 hpa, size_t size, int prot)
1465 {
1466         u64 start_pfn, end_pfn;
1467         struct dma_pte *pte;
1468         int index;
1469         int addr_width = agaw_to_width(domain->agaw);
1470         struct intel_iommu *iommu = domain_get_iommu(domain);
1471
1472         hpa &= (((u64)1) << addr_width) - 1;
1473
1474         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1475                 return -EINVAL;
1476         iova &= PAGE_MASK;
1477         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1478         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1479         index = 0;
1480         while (start_pfn < end_pfn) {
1481                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1482                 if (!pte)
1483                         return -ENOMEM;
1484                 /* We don't need lock here, nobody else
1485                  * touches the iova range
1486                  */
1487                 BUG_ON(dma_pte_addr(pte));
1488                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1489                 dma_set_pte_prot(pte, prot);
1490                 __iommu_flush_cache(iommu, pte, sizeof(*pte));
1491                 start_pfn++;
1492                 index++;
1493         }
1494         return 0;
1495 }
1496
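#if 0
/*
 * Illustrative sketch only, not part of the original file: how a caller
 * might map a physical range 1:1 into a domain with domain_page_mapping()
 * above.  The function name and the identity-map framing are assumptions
 * for illustration; "domain", "start" and "end" are hypothetical.
 */
static int example_identity_map(struct dmar_domain *domain, u64 start, u64 end)
{
	u64 base = start & PAGE_MASK;
	u64 size = PAGE_ALIGN(end - base);

	return domain_page_mapping(domain, (dma_addr_t)base, base, size,
				   DMA_PTE_READ | DMA_PTE_WRITE);
}
#endif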
1497 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1498 {
1499         struct intel_iommu *iommu = domain_get_iommu(domain);
1500
1501         clear_context_table(iommu, bus, devfn);
1502         iommu->flush.flush_context(iommu, 0, 0, 0,
1503                                            DMA_CCMD_GLOBAL_INVL, 0);
1504         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1505                                          DMA_TLB_GLOBAL_FLUSH, 0);
1506 }
1507
1508 static void domain_remove_dev_info(struct dmar_domain *domain)
1509 {
1510         struct device_domain_info *info;
1511         unsigned long flags;
1512
1513         spin_lock_irqsave(&device_domain_lock, flags);
1514         while (!list_empty(&domain->devices)) {
1515                 info = list_entry(domain->devices.next,
1516                         struct device_domain_info, link);
1517                 list_del(&info->link);
1518                 list_del(&info->global);
1519                 if (info->dev)
1520                         info->dev->dev.archdata.iommu = NULL;
1521                 spin_unlock_irqrestore(&device_domain_lock, flags);
1522
1523                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1524                 free_devinfo_mem(info);
1525
1526                 spin_lock_irqsave(&device_domain_lock, flags);
1527         }
1528         spin_unlock_irqrestore(&device_domain_lock, flags);
1529 }
1530
1531 /*
1532  * find_domain
1533  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1534  */
1535 static struct dmar_domain *
1536 find_domain(struct pci_dev *pdev)
1537 {
1538         struct device_domain_info *info;
1539
1540         /* No lock here, assumes no domain exit in normal case */
1541         info = pdev->dev.archdata.iommu;
1542         if (info)
1543                 return info->domain;
1544         return NULL;
1545 }
1546
1547 /* domain is initialized */
1548 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1549 {
1550         struct dmar_domain *domain, *found = NULL;
1551         struct intel_iommu *iommu;
1552         struct dmar_drhd_unit *drhd;
1553         struct device_domain_info *info, *tmp;
1554         struct pci_dev *dev_tmp;
1555         unsigned long flags;
1556         int bus = 0, devfn = 0;
1557
1558         domain = find_domain(pdev);
1559         if (domain)
1560                 return domain;
1561
1562         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1563         if (dev_tmp) {
1564                 if (dev_tmp->is_pcie) {
1565                         bus = dev_tmp->subordinate->number;
1566                         devfn = 0;
1567                 } else {
1568                         bus = dev_tmp->bus->number;
1569                         devfn = dev_tmp->devfn;
1570                 }
1571                 spin_lock_irqsave(&device_domain_lock, flags);
1572                 list_for_each_entry(info, &device_domain_list, global) {
1573                         if (info->bus == bus && info->devfn == devfn) {
1574                                 found = info->domain;
1575                                 break;
1576                         }
1577                 }
1578                 spin_unlock_irqrestore(&device_domain_lock, flags);
1579                 /* pcie-pci bridge already has a domain, uses it */
1580                 if (found) {
1581                         domain = found;
1582                         goto found_domain;
1583                 }
1584         }
1585
1586         /* Allocate new domain for the device */
1587         drhd = dmar_find_matched_drhd_unit(pdev);
1588         if (!drhd) {
1589                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1590                         pci_name(pdev));
1591                 return NULL;
1592         }
1593         iommu = drhd->iommu;
1594
1595         domain = iommu_alloc_domain(iommu);
1596         if (!domain)
1597                 goto error;
1598
1599         if (domain_init(domain, gaw)) {
1600                 domain_exit(domain);
1601                 goto error;
1602         }
1603
1604         /* register pcie-to-pci device */
1605         if (dev_tmp) {
1606                 info = alloc_devinfo_mem();
1607                 if (!info) {
1608                         domain_exit(domain);
1609                         goto error;
1610                 }
1611                 info->bus = bus;
1612                 info->devfn = devfn;
1613                 info->dev = NULL;
1614                 info->domain = domain;
1615                 /* This domain is shared by devices under p2p bridge */
1616                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1617
1618                 /* pcie-to-pci bridge already has a domain, use it */
1619                 found = NULL;
1620                 spin_lock_irqsave(&device_domain_lock, flags);
1621                 list_for_each_entry(tmp, &device_domain_list, global) {
1622                         if (tmp->bus == bus && tmp->devfn == devfn) {
1623                                 found = tmp->domain;
1624                                 break;
1625                         }
1626                 }
1627                 if (found) {
1628                         free_devinfo_mem(info);
1629                         domain_exit(domain);
1630                         domain = found;
1631                 } else {
1632                         list_add(&info->link, &domain->devices);
1633                         list_add(&info->global, &device_domain_list);
1634                 }
1635                 spin_unlock_irqrestore(&device_domain_lock, flags);
1636         }
1637
1638 found_domain:
1639         info = alloc_devinfo_mem();
1640         if (!info)
1641                 goto error;
1642         info->bus = pdev->bus->number;
1643         info->devfn = pdev->devfn;
1644         info->dev = pdev;
1645         info->domain = domain;
1646         spin_lock_irqsave(&device_domain_lock, flags);
1647         /* somebody else was faster and already set up the domain */
1648         found = find_domain(pdev);
1649         if (found != NULL) {
1650                 spin_unlock_irqrestore(&device_domain_lock, flags);
1651                 if (found != domain) {
1652                         domain_exit(domain);
1653                         domain = found;
1654                 }
1655                 free_devinfo_mem(info);
1656                 return domain;
1657         }
1658         list_add(&info->link, &domain->devices);
1659         list_add(&info->global, &device_domain_list);
1660         pdev->dev.archdata.iommu = info;
1661         spin_unlock_irqrestore(&device_domain_lock, flags);
1662         return domain;
1663 error:
1664         /* recheck it here, maybe somebody else set it up meanwhile */
1665         return find_domain(pdev);
1666 }
1667
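/*
 * Set up a 1:1 (identity) mapping of [start, end) in the device's domain:
 * reserve the IOVA range, clear any stale PTEs, create a read/write
 * mapping and install the context entry.
 */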
1668 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1669                                       unsigned long long start,
1670                                       unsigned long long end)
1671 {
1672         struct dmar_domain *domain;
1673         unsigned long size;
1674         unsigned long long base;
1675         int ret;
1676
1677         printk(KERN_INFO
1678                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1679                 pci_name(pdev), start, end);
1680         /* page table init */
1681         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1682         if (!domain)
1683                 return -ENOMEM;
1684
1685         /* The address might not be aligned */
1686         base = start & PAGE_MASK;
1687         size = end - base;
1688         size = PAGE_ALIGN(size);
1689         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1690                         IOVA_PFN(base + size) - 1)) {
1691                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1692                 ret = -ENOMEM;
1693                 goto error;
1694         }
1695
1696         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1697                 size, base, pci_name(pdev));
1698         /*
1699          * The RMRR range might overlap with a physical memory range,
1700          * so clear it first
1701          */
1702         dma_pte_clear_range(domain, base, base + size);
1703
1704         ret = domain_page_mapping(domain, base, base, size,
1705                 DMA_PTE_READ|DMA_PTE_WRITE);
1706         if (ret)
1707                 goto error;
1708
1709         /* context entry init */
1710         ret = domain_context_mapping(domain, pdev);
1711         if (!ret)
1712                 return 0;
1713 error:
1714         domain_exit(domain);
1715         return ret;
1716
1717 }
1718
1719 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1720         struct pci_dev *pdev)
1721 {
1722         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1723                 return 0;
1724         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1725                 rmrr->end_address + 1);
1726 }
1727
1728 #ifdef CONFIG_DMAR_GFX_WA
1729 struct iommu_prepare_data {
1730         struct pci_dev *pdev;
1731         int ret;
1732 };
1733
1734 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1735                                          unsigned long end_pfn, void *datax)
1736 {
1737         struct iommu_prepare_data *data;
1738
1739         data = (struct iommu_prepare_data *)datax;
1740
1741         data->ret = iommu_prepare_identity_map(data->pdev,
1742                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1743         return data->ret;
1744
1745 }
1746
1747 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1748 {
1749         int nid;
1750         struct iommu_prepare_data data;
1751
1752         data.pdev = pdev;
1753         data.ret = 0;
1754
1755         for_each_online_node(nid) {
1756                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1757                 if (data.ret)
1758                         return data.ret;
1759         }
1760         return data.ret;
1761 }
1762
1763 static void __init iommu_prepare_gfx_mapping(void)
1764 {
1765         struct pci_dev *pdev = NULL;
1766         int ret;
1767
1768         for_each_pci_dev(pdev) {
1769                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1770                                 !IS_GFX_DEVICE(pdev))
1771                         continue;
1772                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1773                         pci_name(pdev));
1774                 ret = iommu_prepare_with_active_regions(pdev);
1775                 if (ret)
1776                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1777         }
1778 }
1779 #else /* !CONFIG_DMAR_GFX_WA */
1780 static inline void iommu_prepare_gfx_mapping(void)
1781 {
1782         return;
1783 }
1784 #endif
1785
1786 #ifdef CONFIG_DMAR_FLOPPY_WA
1787 static inline void iommu_prepare_isa(void)
1788 {
1789         struct pci_dev *pdev;
1790         int ret;
1791
1792         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1793         if (!pdev)
1794                 return;
1795
1796         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1797         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1798
1799         if (ret)
1800                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1801                         "floppy might not work\n");
1802
1803 }
1804 #else
1805 static inline void iommu_prepare_isa(void)
1806 {
1807         return;
1808 }
1809 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1810
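/*
 * One-time DMAR initialization: count the DRHD units, allocate the global
 * iommu array and the deferred-flush tables, set up domains, root entries
 * and the invalidation interface per IOMMU, map the RMRR/gfx/ISA regions,
 * then enable fault reporting and translation.
 */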
1811 static int __init init_dmars(void)
1812 {
1813         struct dmar_drhd_unit *drhd;
1814         struct dmar_rmrr_unit *rmrr;
1815         struct pci_dev *pdev;
1816         struct intel_iommu *iommu;
1817         int i, ret, unit = 0;
1818
1819         /*
1820          * for each drhd
1821          *    allocate root
1822          *    initialize and program root entry to not present
1823          * endfor
1824          */
1825         for_each_drhd_unit(drhd) {
1826                 g_num_of_iommus++;
1827                 /*
1828                  * lock not needed: this is only incremented in the
1829                  * single-threaded kernel __init code path; all other
1830                  * accesses are read-only
1831                  */
1832         }
1833
1834         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1835                         GFP_KERNEL);
1836         if (!g_iommus) {
1837                 printk(KERN_ERR "Allocating global iommu array failed\n");
1838                 ret = -ENOMEM;
1839                 goto error;
1840         }
1841
1842         deferred_flush = kzalloc(g_num_of_iommus *
1843                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1844         if (!deferred_flush) {
1845                 /* g_iommus is freed in the common error path below */
1846                 ret = -ENOMEM;
1847                 goto error;
1848         }
1849
1850         for_each_drhd_unit(drhd) {
1851                 if (drhd->ignored)
1852                         continue;
1853
1854                 iommu = drhd->iommu;
1855                 g_iommus[iommu->seq_id] = iommu;
1856
1857                 ret = iommu_init_domains(iommu);
1858                 if (ret)
1859                         goto error;
1860
1861                 /*
1862                  * TBD:
1863                  * we could share the same root & context tables
1864                  * among all IOMMUs. Needs to be split later.
1865                  */
1866                 ret = iommu_alloc_root_entry(iommu);
1867                 if (ret) {
1868                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1869                         goto error;
1870                 }
1871         }
1872
1873         for_each_drhd_unit(drhd) {
1874                 if (drhd->ignored)
1875                         continue;
1876
1877                 iommu = drhd->iommu;
1878                 if (dmar_enable_qi(iommu)) {
1879                         /*
1880                          * Queued invalidation is not enabled; use
1881                          * register-based invalidation
1882                          */
1883                         iommu->flush.flush_context = __iommu_flush_context;
1884                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1885                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1886                                "invalidation\n",
1887                                (unsigned long long)drhd->reg_base_addr);
1888                 } else {
1889                         iommu->flush.flush_context = qi_flush_context;
1890                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1891                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1892                                "invalidation\n",
1893                                (unsigned long long)drhd->reg_base_addr);
1894                 }
1895         }
1896
1897         /*
1898          * For each rmrr
1899          *   for each dev attached to rmrr
1900          *   do
1901          *     locate drhd for dev, alloc domain for dev
1902          *     allocate free domain
1903          *     allocate page table entries for rmrr
1904          *     if context not allocated for bus
1905          *           allocate and init context
1906          *           set present in root table for this bus
1907          *     init context with domain, translation etc
1908          *    endfor
1909          * endfor
1910          */
1911         for_each_rmrr_units(rmrr) {
1912                 for (i = 0; i < rmrr->devices_cnt; i++) {
1913                         pdev = rmrr->devices[i];
1914                         /* some BIOSes list non-existent devices in the DMAR table */
1915                         if (!pdev)
1916                                 continue;
1917                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1918                         if (ret)
1919                                 printk(KERN_ERR
1920                                  "IOMMU: mapping reserved region failed\n");
1921                 }
1922         }
1923
1924         iommu_prepare_gfx_mapping();
1925
1926         iommu_prepare_isa();
1927
1928         /*
1929          * for each drhd
1930          *   enable fault log
1931          *   global invalidate context cache
1932          *   global invalidate iotlb
1933          *   enable translation
1934          */
1935         for_each_drhd_unit(drhd) {
1936                 if (drhd->ignored)
1937                         continue;
1938                 iommu = drhd->iommu;
1939                 sprintf(iommu->name, "dmar%d", unit++);
1940
1941                 iommu_flush_write_buffer(iommu);
1942
1943                 ret = dmar_set_interrupt(iommu);
1944                 if (ret)
1945                         goto error;
1946
1947                 iommu_set_root_entry(iommu);
1948
1949                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1950                                            0);
1951                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1952                                          0);
1953                 iommu_disable_protect_mem_regions(iommu);
1954
1955                 ret = iommu_enable_translation(iommu);
1956                 if (ret)
1957                         goto error;
1958         }
1959
1960         return 0;
1961 error:
1962         for_each_drhd_unit(drhd) {
1963                 if (drhd->ignored)
1964                         continue;
1965                 iommu = drhd->iommu;
1966                 free_iommu(iommu);
1967         }
1968         kfree(g_iommus);
1969         return ret;
1970 }
1971
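/*
 * Number of bytes needed to map @size bytes starting at @host_addr,
 * rounded out to whole pages (accounts for the offset within the first page).
 */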
1972 static inline u64 aligned_size(u64 host_addr, size_t size)
1973 {
1974         u64 addr;
1975         addr = (host_addr & (~PAGE_MASK)) + size;
1976         return PAGE_ALIGN(addr);
1977 }
1978
1979 struct iova *
1980 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1981 {
1982         struct iova *piova;
1983
1984         /* Make sure it's in range */
1985         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1986         if (!size || (IOVA_START_ADDR + size > end))
1987                 return NULL;
1988
1989         piova = alloc_iova(&domain->iovad,
1990                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1991         return piova;
1992 }
1993
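/*
 * Allocate an IOVA range of @size bytes for @dev: devices limited to a
 * 32-bit DMA mask (or when dmar_forcedac is set) allocate directly within
 * their mask; others try below 4GB first and fall back to the full mask.
 */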
1994 static struct iova *
1995 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1996                    size_t size, u64 dma_mask)
1997 {
1998         struct pci_dev *pdev = to_pci_dev(dev);
1999         struct iova *iova = NULL;
2000
2001         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2002                 iova = iommu_alloc_iova(domain, size, dma_mask);
2003         else {
2004                 /*
2005                  * First try to allocate an I/O virtual address below
2006                  * DMA_32BIT_MASK and, if that fails, try allocating
2007                  * from the higher range
2008                  */
2009                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2010                 if (!iova)
2011                         iova = iommu_alloc_iova(domain, size, dma_mask);
2012         }
2013
2014         if (!iova) {
2015                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2016                 return NULL;
2017         }
2018
2019         return iova;
2020 }
2021
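/* Get the device's domain, creating it and its context mapping on demand. */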
2022 static struct dmar_domain *
2023 get_valid_domain_for_dev(struct pci_dev *pdev)
2024 {
2025         struct dmar_domain *domain;
2026         int ret;
2027
2028         domain = get_domain_for_dev(pdev,
2029                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2030         if (!domain) {
2031                 printk(KERN_ERR
2032                         "Allocating domain for %s failed\n", pci_name(pdev));
2033                 return NULL;
2034         }
2035
2036         /* make sure context mapping is ok */
2037         if (unlikely(!domain_context_mapped(domain, pdev))) {
2038                 ret = domain_context_mapping(domain, pdev);
2039                 if (ret) {
2040                         printk(KERN_ERR
2041                                 "Domain context map for %s failed\n",
2042                                 pci_name(pdev));
2043                         return NULL;
2044                 }
2045         }
2046
2047         return domain;
2048 }
2049
2050 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2051                                      size_t size, int dir, u64 dma_mask)
2052 {
2053         struct pci_dev *pdev = to_pci_dev(hwdev);
2054         struct dmar_domain *domain;
2055         phys_addr_t start_paddr;
2056         struct iova *iova;
2057         int prot = 0;
2058         int ret;
2059         struct intel_iommu *iommu;
2060
2061         BUG_ON(dir == DMA_NONE);
2062         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2063                 return paddr;
2064
2065         domain = get_valid_domain_for_dev(pdev);
2066         if (!domain)
2067                 return 0;
2068
2069         iommu = domain_get_iommu(domain);
2070         size = aligned_size((u64)paddr, size);
2071
2072         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2073         if (!iova)
2074                 goto error;
2075
2076         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2077
2078         /*
2079          * Check if DMAR supports zero-length reads on write-only
2080          * mappings.
2081          */
2082         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2083                         !cap_zlr(iommu->cap))
2084                 prot |= DMA_PTE_READ;
2085         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2086                 prot |= DMA_PTE_WRITE;
2087         /*
2088          * [paddr, paddr + size) might cover only part of a page; map the
2089          * whole page.  Note: if two parts of one page are mapped separately,
2090          * we might end up with two guest addresses mapping to the same host
2091          * paddr, but this is not a big problem
2092          */
2093         ret = domain_page_mapping(domain, start_paddr,
2094                 ((u64)paddr) & PAGE_MASK, size, prot);
2095         if (ret)
2096                 goto error;
2097
2098         /* it's a non-present to present mapping */
2099         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2100                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2101         if (ret)
2102                 iommu_flush_write_buffer(iommu);
2103
2104         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2105
2106 error:
2107         if (iova)
2108                 __free_iova(&domain->iovad, iova);
2109         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2110                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2111         return 0;
2112 }
2113
2114 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2115                             size_t size, int dir)
2116 {
2117         return __intel_map_single(hwdev, paddr, size, dir,
2118                                   to_pci_dev(hwdev)->dma_mask);
2119 }
2120
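/*
 * Flush the IOTLB of every IOMMU that has deferred unmaps pending and
 * release the corresponding IOVAs.  Caller holds async_umap_flush_lock.
 */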
2121 static void flush_unmaps(void)
2122 {
2123         int i, j;
2124
2125         timer_on = 0;
2126
2127         /* just flush them all */
2128         for (i = 0; i < g_num_of_iommus; i++) {
2129                 struct intel_iommu *iommu = g_iommus[i];
2130                 if (!iommu)
2131                         continue;
2132
2133                 if (deferred_flush[i].next) {
2134                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2135                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2136                         for (j = 0; j < deferred_flush[i].next; j++) {
2137                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2138                                                 deferred_flush[i].iova[j]);
2139                         }
2140                         deferred_flush[i].next = 0;
2141                 }
2142         }
2143
2144         list_size = 0;
2145 }
2146
2147 static void flush_unmaps_timeout(unsigned long data)
2148 {
2149         unsigned long flags;
2150
2151         spin_lock_irqsave(&async_umap_flush_lock, flags);
2152         flush_unmaps();
2153         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2154 }
2155
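/*
 * Queue an IOVA for deferred release on its IOMMU's per-unit table and arm
 * the flush timer; flush immediately once HIGH_WATER_MARK entries pile up.
 */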
2156 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2157 {
2158         unsigned long flags;
2159         int next, iommu_id;
2160         struct intel_iommu *iommu;
2161
2162         spin_lock_irqsave(&async_umap_flush_lock, flags);
2163         if (list_size == HIGH_WATER_MARK)
2164                 flush_unmaps();
2165
2166         iommu = domain_get_iommu(dom);
2167         iommu_id = iommu->seq_id;
2168
2169         next = deferred_flush[iommu_id].next;
2170         deferred_flush[iommu_id].domain[next] = dom;
2171         deferred_flush[iommu_id].iova[next] = iova;
2172         deferred_flush[iommu_id].next++;
2173
2174         if (!timer_on) {
2175                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2176                 timer_on = 1;
2177         }
2178         list_size++;
2179         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2180 }
2181
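/*
 * Tear down the mapping at @dev_addr: clear the PTEs, free the page tables
 * and either flush the IOTLB synchronously (strict mode) or defer the IOVA
 * release to the batched flush path.
 */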
2182 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2183                         int dir)
2184 {
2185         struct pci_dev *pdev = to_pci_dev(dev);
2186         struct dmar_domain *domain;
2187         unsigned long start_addr;
2188         struct iova *iova;
2189         struct intel_iommu *iommu;
2190
2191         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2192                 return;
2193         domain = find_domain(pdev);
2194         BUG_ON(!domain);
2195
2196         iommu = domain_get_iommu(domain);
2197
2198         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2199         if (!iova)
2200                 return;
2201
2202         start_addr = iova->pfn_lo << PAGE_SHIFT;
2203         size = aligned_size((u64)dev_addr, size);
2204
2205         pr_debug("Device %s unmapping: %lx@%llx\n",
2206                 pci_name(pdev), size, (unsigned long long)start_addr);
2207
2208         /*  clear the whole page */
2209         dma_pte_clear_range(domain, start_addr, start_addr + size);
2210         /* free page tables */
2211         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2212         if (intel_iommu_strict) {
2213                 if (iommu_flush_iotlb_psi(iommu,
2214                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2215                         iommu_flush_write_buffer(iommu);
2216                 /* free iova */
2217                 __free_iova(&domain->iovad, iova);
2218         } else {
2219                 add_unmap(domain, iova);
2220                 /*
2221                  * queue up the release of the unmap to save the roughly 1/6th
2222                  * of the CPU time used up by the iotlb flush operation...
2223                  */
2224         }
2225 }
2226
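/*
 * Allocate a zeroed, page-aligned buffer and map it bidirectionally through
 * the IOMMU using the device's coherent DMA mask.
 */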
2227 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2228                            dma_addr_t *dma_handle, gfp_t flags)
2229 {
2230         void *vaddr;
2231         int order;
2232
2233         size = PAGE_ALIGN(size);
2234         order = get_order(size);
2235         flags &= ~(GFP_DMA | GFP_DMA32);
2236
2237         vaddr = (void *)__get_free_pages(flags, order);
2238         if (!vaddr)
2239                 return NULL;
2240         memset(vaddr, 0, size);
2241
2242         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2243                                          DMA_BIDIRECTIONAL,
2244                                          hwdev->coherent_dma_mask);
2245         if (*dma_handle)
2246                 return vaddr;
2247         free_pages((unsigned long)vaddr, order);
2248         return NULL;
2249 }
2250
2251 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2252                          dma_addr_t dma_handle)
2253 {
2254         int order;
2255
2256         size = PAGE_ALIGN(size);
2257         order = get_order(size);
2258
2259         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2260         free_pages((unsigned long)vaddr, order);
2261 }
2262
2263 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2264
2265 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2266                     int nelems, int dir)
2267 {
2268         int i;
2269         struct pci_dev *pdev = to_pci_dev(hwdev);
2270         struct dmar_domain *domain;
2271         unsigned long start_addr;
2272         struct iova *iova;
2273         size_t size = 0;
2274         void *addr;
2275         struct scatterlist *sg;
2276         struct intel_iommu *iommu;
2277
2278         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2279                 return;
2280
2281         domain = find_domain(pdev);
2282         BUG_ON(!domain);
2283
2284         iommu = domain_get_iommu(domain);
2285
2286         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2287         if (!iova)
2288                 return;
2289         for_each_sg(sglist, sg, nelems, i) {
2290                 addr = SG_ENT_VIRT_ADDRESS(sg);
2291                 size += aligned_size((u64)addr, sg->length);
2292         }
2293
2294         start_addr = iova->pfn_lo << PAGE_SHIFT;
2295
2296         /*  clear the whole page */
2297         dma_pte_clear_range(domain, start_addr, start_addr + size);
2298         /* free page tables */
2299         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2300
2301         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2302                         size >> VTD_PAGE_SHIFT, 0))
2303                 iommu_flush_write_buffer(iommu);
2304
2305         /* free iova */
2306         __free_iova(&domain->iovad, iova);
2307 }
2308
2309 static int intel_nontranslate_map_sg(struct device *hddev,
2310         struct scatterlist *sglist, int nelems, int dir)
2311 {
2312         int i;
2313         struct scatterlist *sg;
2314
2315         for_each_sg(sglist, sg, nelems, i) {
2316                 BUG_ON(!sg_page(sg));
2317                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2318                 sg->dma_length = sg->length;
2319         }
2320         return nelems;
2321 }
2322
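/*
 * Map a scatterlist into one contiguous IOVA range: allocate an IOVA large
 * enough for all (page-aligned) segments, map each segment in turn and
 * flush the IOTLB for the newly present entries.
 */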
2323 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2324                  int dir)
2325 {
2326         void *addr;
2327         int i;
2328         struct pci_dev *pdev = to_pci_dev(hwdev);
2329         struct dmar_domain *domain;
2330         size_t size = 0;
2331         int prot = 0;
2332         size_t offset = 0;
2333         struct iova *iova = NULL;
2334         int ret;
2335         struct scatterlist *sg;
2336         unsigned long start_addr;
2337         struct intel_iommu *iommu;
2338
2339         BUG_ON(dir == DMA_NONE);
2340         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2341                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2342
2343         domain = get_valid_domain_for_dev(pdev);
2344         if (!domain)
2345                 return 0;
2346
2347         iommu = domain_get_iommu(domain);
2348
2349         for_each_sg(sglist, sg, nelems, i) {
2350                 addr = SG_ENT_VIRT_ADDRESS(sg);
2351                 addr = (void *)virt_to_phys(addr);
2352                 size += aligned_size((u64)addr, sg->length);
2353         }
2354
2355         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2356         if (!iova) {
2357                 sglist->dma_length = 0;
2358                 return 0;
2359         }
2360
2361         /*
2362          * Check if DMAR supports zero-length reads on write-only
2363          * mappings.
2364          */
2365         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2366                         !cap_zlr(iommu->cap))
2367                 prot |= DMA_PTE_READ;
2368         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2369                 prot |= DMA_PTE_WRITE;
2370
2371         start_addr = iova->pfn_lo << PAGE_SHIFT;
2372         offset = 0;
2373         for_each_sg(sglist, sg, nelems, i) {
2374                 addr = SG_ENT_VIRT_ADDRESS(sg);
2375                 addr = (void *)virt_to_phys(addr);
2376                 size = aligned_size((u64)addr, sg->length);
2377                 ret = domain_page_mapping(domain, start_addr + offset,
2378                         ((u64)addr) & PAGE_MASK,
2379                         size, prot);
2380                 if (ret) {
2381                         /*  clear the page */
2382                         dma_pte_clear_range(domain, start_addr,
2383                                   start_addr + offset);
2384                         /* free page tables */
2385                         dma_pte_free_pagetable(domain, start_addr,
2386                                   start_addr + offset);
2387                         /* free iova */
2388                         __free_iova(&domain->iovad, iova);
2389                         return 0;
2390                 }
2391                 sg->dma_address = start_addr + offset +
2392                                 ((u64)addr & (~PAGE_MASK));
2393                 sg->dma_length = sg->length;
2394                 offset += size;
2395         }
2396
2397         /* it's a non-present to present mapping */
2398         if (iommu_flush_iotlb_psi(iommu, domain->id,
2399                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2400                 iommu_flush_write_buffer(iommu);
2401         return nelems;
2402 }
2403
2404 static struct dma_mapping_ops intel_dma_ops = {
2405         .alloc_coherent = intel_alloc_coherent,
2406         .free_coherent = intel_free_coherent,
2407         .map_single = intel_map_single,
2408         .unmap_single = intel_unmap_single,
2409         .map_sg = intel_map_sg,
2410         .unmap_sg = intel_unmap_sg,
2411 };
2412
2413 static inline int iommu_domain_cache_init(void)
2414 {
2415         int ret = 0;
2416
2417         iommu_domain_cache = kmem_cache_create("iommu_domain",
2418                                          sizeof(struct dmar_domain),
2419                                          0,
2420                                          SLAB_HWCACHE_ALIGN,
2421                                          NULL);
2423         if (!iommu_domain_cache) {
2424                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2425                 ret = -ENOMEM;
2426         }
2427
2428         return ret;
2429 }
2430
2431 static inline int iommu_devinfo_cache_init(void)
2432 {
2433         int ret = 0;
2434
2435         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2436                                          sizeof(struct device_domain_info),
2437                                          0,
2438                                          SLAB_HWCACHE_ALIGN,
2439                                          NULL);
2440         if (!iommu_devinfo_cache) {
2441                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2442                 ret = -ENOMEM;
2443         }
2444
2445         return ret;
2446 }
2447
2448 static inline int iommu_iova_cache_init(void)
2449 {
2450         int ret = 0;
2451
2452         iommu_iova_cache = kmem_cache_create("iommu_iova",
2453                                          sizeof(struct iova),
2454                                          0,
2455                                          SLAB_HWCACHE_ALIGN,
2456                                          NULL);
2457         if (!iommu_iova_cache) {
2458                 printk(KERN_ERR "Couldn't create iova cache\n");
2459                 ret = -ENOMEM;
2460         }
2461
2462         return ret;
2463 }
2464
2465 static int __init iommu_init_mempool(void)
2466 {
2467         int ret;
2468         ret = iommu_iova_cache_init();
2469         if (ret)
2470                 return ret;
2471
2472         ret = iommu_domain_cache_init();
2473         if (ret)
2474                 goto domain_error;
2475
2476         ret = iommu_devinfo_cache_init();
2477         if (!ret)
2478                 return ret;
2479
2480         kmem_cache_destroy(iommu_domain_cache);
2481 domain_error:
2482         kmem_cache_destroy(iommu_iova_cache);
2483
2484         return -ENOMEM;
2485 }
2486
2487 static void __init iommu_exit_mempool(void)
2488 {
2489         kmem_cache_destroy(iommu_devinfo_cache);
2490         kmem_cache_destroy(iommu_domain_cache);
2491         kmem_cache_destroy(iommu_iova_cache);
2492
2493 }
2494
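/*
 * Mark DRHD units that cover no PCI devices as ignored; when dmar_map_gfx
 * is disabled, also bypass units that serve only graphics devices and tag
 * those devices with DUMMY_DEVICE_DOMAIN_INFO.
 */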
2495 static void __init init_no_remapping_devices(void)
2496 {
2497         struct dmar_drhd_unit *drhd;
2498
2499         for_each_drhd_unit(drhd) {
2500                 if (!drhd->include_all) {
2501                         int i;
2502                         for (i = 0; i < drhd->devices_cnt; i++)
2503                                 if (drhd->devices[i] != NULL)
2504                                         break;
2505                         /* ignore the DMAR unit if no PCI devices exist */
2506                         if (i == drhd->devices_cnt)
2507                                 drhd->ignored = 1;
2508                 }
2509         }
2510
2511         if (dmar_map_gfx)
2512                 return;
2513
2514         for_each_drhd_unit(drhd) {
2515                 int i;
2516                 if (drhd->ignored || drhd->include_all)
2517                         continue;
2518
2519                 for (i = 0; i < drhd->devices_cnt; i++)
2520                         if (drhd->devices[i] &&
2521                                 !IS_GFX_DEVICE(drhd->devices[i]))
2522                                 break;
2523
2524                 if (i < drhd->devices_cnt)
2525                         continue;
2526
2527                 /* bypass IOMMU if it is just for gfx devices */
2528                 drhd->ignored = 1;
2529                 for (i = 0; i < drhd->devices_cnt; i++) {
2530                         if (!drhd->devices[i])
2531                                 continue;
2532                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2533                 }
2534         }
2535 }
2536
2537 int __init intel_iommu_init(void)
2538 {
2539         int ret = 0;
2540
2541         if (dmar_table_init())
2542                 return  -ENODEV;
2543
2544         if (dmar_dev_scope_init())
2545                 return  -ENODEV;
2546
2547         /*
2548          * Check the need for DMA-remapping initialization now.
2549          * The above initialization will also be used by interrupt remapping.
2550          */
2551         if (no_iommu || swiotlb || dmar_disabled)
2552                 return -ENODEV;
2553
2554         iommu_init_mempool();
2555         dmar_init_reserved_ranges();
2556
2557         init_no_remapping_devices();
2558
2559         ret = init_dmars();
2560         if (ret) {
2561                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2562                 put_iova_domain(&reserved_iova_list);
2563                 iommu_exit_mempool();
2564                 return ret;
2565         }
2566         printk(KERN_INFO
2567         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2568
2569         init_timer(&unmap_timer);
2570         force_iommu = 1;
2571         dma_ops = &intel_dma_ops;
2572         return 0;
2573 }
2574
2575 void intel_iommu_domain_exit(struct dmar_domain *domain)
2576 {
2577         u64 end;
2578
2579         /* Domain 0 is reserved, so don't process it */
2580         if (!domain)
2581                 return;
2582
2583         end = DOMAIN_MAX_ADDR(domain->gaw);
2584         end = end & (~VTD_PAGE_MASK);
2585
2586         /* clear ptes */
2587         dma_pte_clear_range(domain, 0, end);
2588
2589         /* free page tables */
2590         dma_pte_free_pagetable(domain, 0, end);
2591
2592         iommu_free_domain(domain);
2593         free_domain_mem(domain);
2594 }
2595 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2596
2597 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2598 {
2599         struct dmar_drhd_unit *drhd;
2600         struct dmar_domain *domain;
2601         struct intel_iommu *iommu;
2602
2603         drhd = dmar_find_matched_drhd_unit(pdev);
2604         if (!drhd) {
2605                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2606                 return NULL;
2607         }
2608
2609         iommu = drhd->iommu;
2610         if (!iommu) {
2611                 printk(KERN_ERR
2612                         "intel_iommu_domain_alloc: iommu == NULL\n");
2613                 return NULL;
2614         }
2615         domain = iommu_alloc_domain(iommu);
2616         if (!domain) {
2617                 printk(KERN_ERR
2618                         "intel_iommu_domain_alloc: domain == NULL\n");
2619                 return NULL;
2620         }
2621         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2622                 printk(KERN_ERR
2623                         "intel_iommu_domain_alloc: domain_init() failed\n");
2624                 intel_iommu_domain_exit(domain);
2625                 return NULL;
2626         }
2627         return domain;
2628 }
2629 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2630
2631 int intel_iommu_context_mapping(
2632         struct dmar_domain *domain, struct pci_dev *pdev)
2633 {
2634         int rc;
2635         rc = domain_context_mapping(domain, pdev);
2636         return rc;
2637 }
2638 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2639
2640 int intel_iommu_page_mapping(
2641         struct dmar_domain *domain, dma_addr_t iova,
2642         u64 hpa, size_t size, int prot)
2643 {
2644         int rc;
2645         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2646         return rc;
2647 }
2648 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2649
2650 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2651 {
2652         detach_domain_for_dev(domain, bus, devfn);
2653 }
2654 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2655
2656 struct dmar_domain *
2657 intel_iommu_find_domain(struct pci_dev *pdev)
2658 {
2659         return find_domain(pdev);
2660 }
2661 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2662
2663 int intel_iommu_found(void)
2664 {
2665         return g_num_of_iommus;
2666 }
2667 EXPORT_SYMBOL_GPL(intel_iommu_found);
2668
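/* Walk the domain's page table and return the host PFN backing @iova. */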
2669 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2670 {
2671         struct dma_pte *pte;
2672         u64 pfn;
2673
2674         pfn = 0;
2675         pte = addr_to_dma_pte(domain, iova);
2676
2677         if (pte)
2678                 pfn = dma_pte_addr(pte);
2679
2680         return pfn >> VTD_PAGE_SHIFT;
2681 }
2682 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);