1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
59
60 /* global iommu list, set NULL for ignored DMAR units */
61 static struct intel_iommu **g_iommus;
62
63 /*
64  * 0: Present
65  * 1-11: Reserved
66  * 12-63: Context Ptr (12 - (haw-1))
67  * 64-127: Reserved
68  */
69 struct root_entry {
70         u64     val;
71         u64     rsvd1;
72 };
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
74 static inline bool root_present(struct root_entry *root)
75 {
76         return (root->val & 1);
77 }
78 static inline void set_root_present(struct root_entry *root)
79 {
80         root->val |= 1;
81 }
82 static inline void set_root_value(struct root_entry *root, unsigned long value)
83 {
84         root->val |= value & VTD_PAGE_MASK;
85 }
86
87 static inline struct context_entry *
88 get_context_addr_from_root(struct root_entry *root)
89 {
90         return (struct context_entry *)
91                 (root_present(root)?phys_to_virt(
92                 root->val & VTD_PAGE_MASK) :
93                 NULL);
94 }
95
96 /*
97  * low 64 bits:
98  * 0: present
99  * 1: fault processing disable
100  * 2-3: translation type
101  * 12-63: address space root
102  * high 64 bits:
103  * 0-2: address width
104  * 3-6: avail
105  * 8-23: domain id
106  */
107 struct context_entry {
108         u64 lo;
109         u64 hi;
110 };
111
112 static inline bool context_present(struct context_entry *context)
113 {
114         return (context->lo & 1);
115 }
116 static inline void context_set_present(struct context_entry *context)
117 {
118         context->lo |= 1;
119 }
120
121 static inline void context_set_fault_enable(struct context_entry *context)
122 {
123         context->lo &= (((u64)-1) << 2) | 1;
124 }
125
126 #define CONTEXT_TT_MULTI_LEVEL 0
127
128 static inline void context_set_translation_type(struct context_entry *context,
129                                                 unsigned long value)
130 {
131         context->lo &= (((u64)-1) << 4) | 3;
132         context->lo |= (value & 3) << 2;
133 }
134
135 static inline void context_set_address_root(struct context_entry *context,
136                                             unsigned long value)
137 {
138         context->lo |= value & VTD_PAGE_MASK;
139 }
140
141 static inline void context_set_address_width(struct context_entry *context,
142                                              unsigned long value)
143 {
144         context->hi |= value & 7;
145 }
146
147 static inline void context_set_domain_id(struct context_entry *context,
148                                          unsigned long value)
149 {
150         context->hi |= (value & ((1 << 16) - 1)) << 8;
151 }
152
153 static inline void context_clear_entry(struct context_entry *context)
154 {
155         context->lo = 0;
156         context->hi = 0;
157 }
158
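/*
 * Illustrative sketch (kept under #if 0, not built): how the accessors
 * above compose a context entry according to the bit layout documented
 * before struct context_entry.  It mirrors what
 * domain_context_mapping_one() does later in this file; the argument
 * values are whatever the caller chooses.
 */
#if 0
static void example_fill_context_entry(struct context_entry *ce,
					u64 pgtable_phys, int agaw, int did)
{
	context_clear_entry(ce);
	context_set_domain_id(ce, did);			/* hi bits 8-23 */
	context_set_address_width(ce, agaw);		/* hi bits 0-2 */
	context_set_address_root(ce, pgtable_phys);	/* lo bits 12-63 */
	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
	context_set_fault_enable(ce);
	context_set_present(ce);			/* lo bit 0, set last */
}
#endif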
159 /*
160  * 0: readable
161  * 1: writable
162  * 2-6: reserved
163  * 7: super page
164  * 8-11: available
165  * 12-63: Host physical address
166  */
167 struct dma_pte {
168         u64 val;
169 };
170
171 static inline void dma_clear_pte(struct dma_pte *pte)
172 {
173         pte->val = 0;
174 }
175
176 static inline void dma_set_pte_readable(struct dma_pte *pte)
177 {
178         pte->val |= DMA_PTE_READ;
179 }
180
181 static inline void dma_set_pte_writable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_WRITE;
184 }
185
186 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
187 {
188         pte->val = (pte->val & ~3) | (prot & 3);
189 }
190
191 static inline u64 dma_pte_addr(struct dma_pte *pte)
192 {
193         return (pte->val & VTD_PAGE_MASK);
194 }
195
196 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
197 {
198         pte->val |= (addr & VTD_PAGE_MASK);
199 }
200
201 static inline bool dma_pte_present(struct dma_pte *pte)
202 {
203         return (pte->val & 3) != 0;
204 }
205
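/*
 * Illustrative sketch (under #if 0, not built): composing a last-level
 * PTE with the helpers above, much as domain_page_mapping() does later
 * in this file.  The host physical address goes in bits 12-63 and must
 * be VTD_PAGE_SIZE aligned; bits 0/1 carry the read/write permissions
 * from the layout comment above.
 */
#if 0
static void example_fill_pte(struct dma_pte *pte, u64 host_phys_addr)
{
	dma_clear_pte(pte);
	dma_set_pte_addr(pte, host_phys_addr & VTD_PAGE_MASK);
	dma_set_pte_prot(pte, DMA_PTE_READ | DMA_PTE_WRITE);
}
#endif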
206 /* devices under the same p2p bridge are owned by one domain */
207 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
208
209 /* domain represents a virtual machine; more than one device
210  * across iommus may be owned by one domain, e.g. a kvm guest.
211  */
212 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
213
214 struct dmar_domain {
215         int     id;                     /* domain id */
216         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
217
218         struct list_head devices;       /* all devices' list */
219         struct iova_domain iovad;       /* iova's that belong to this domain */
220
221         struct dma_pte  *pgd;           /* virtual address */
222         spinlock_t      mapping_lock;   /* page table lock */
223         int             gaw;            /* max guest address width */
224
225         /* adjusted guest address width; 0 means a 2-level, 30-bit table */
226         int             agaw;
227
228         int             flags;          /* flags to find out type of domain */
229
230         int             iommu_coherency;/* indicate coherency of iommu access */
231         int             iommu_count;    /* reference count of iommu */
232         spinlock_t      iommu_lock;     /* protect iommu set in domain */
233 };
234
235 /* PCI domain-device relationship */
236 struct device_domain_info {
237         struct list_head link;  /* link to domain siblings */
238         struct list_head global; /* link to global list */
239         u8 bus;                 /* PCI bus number */
240         u8 devfn;               /* PCI devfn number */
241         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
242         struct dmar_domain *domain; /* pointer to domain */
243 };
244
245 static void flush_unmaps_timeout(unsigned long data);
246
247 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
248
249 #define HIGH_WATER_MARK 250
250 struct deferred_flush_tables {
251         int next;
252         struct iova *iova[HIGH_WATER_MARK];
253         struct dmar_domain *domain[HIGH_WATER_MARK];
254 };
255
256 static struct deferred_flush_tables *deferred_flush;
257
258 /* bitmap for indexing intel_iommus */
259 static int g_num_of_iommus;
260
261 static DEFINE_SPINLOCK(async_umap_flush_lock);
262 static LIST_HEAD(unmaps_to_do);
263
264 static int timer_on;
265 static long list_size;
266
267 static void domain_remove_dev_info(struct dmar_domain *domain);
268
269 int dmar_disabled;
270 static int __initdata dmar_map_gfx = 1;
271 static int dmar_forcedac;
272 static int intel_iommu_strict;
273
274 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
275 static DEFINE_SPINLOCK(device_domain_lock);
276 static LIST_HEAD(device_domain_list);
277
278 static int __init intel_iommu_setup(char *str)
279 {
280         if (!str)
281                 return -EINVAL;
282         while (*str) {
283                 if (!strncmp(str, "off", 3)) {
284                         dmar_disabled = 1;
285                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
286                 } else if (!strncmp(str, "igfx_off", 8)) {
287                         dmar_map_gfx = 0;
288                         printk(KERN_INFO
289                                 "Intel-IOMMU: disable GFX device mapping\n");
290                 } else if (!strncmp(str, "forcedac", 8)) {
291                         printk(KERN_INFO
292                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
293                         dmar_forcedac = 1;
294                 } else if (!strncmp(str, "strict", 6)) {
295                         printk(KERN_INFO
296                                 "Intel-IOMMU: disable batched IOTLB flush\n");
297                         intel_iommu_strict = 1;
298                 }
299
300                 str += strcspn(str, ",");
301                 while (*str == ',')
302                         str++;
303         }
304         return 0;
305 }
306 __setup("intel_iommu=", intel_iommu_setup);
307
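/*
 * Usage example: the options parsed above can be combined on the kernel
 * command line, separated by commas, e.g.
 *
 *	intel_iommu=igfx_off,strict
 *
 * which skips IOMMU mapping for graphics devices and disables the
 * batched IOTLB flush.
 */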
308 static struct kmem_cache *iommu_domain_cache;
309 static struct kmem_cache *iommu_devinfo_cache;
310 static struct kmem_cache *iommu_iova_cache;
311
312 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
313 {
314         unsigned int flags;
315         void *vaddr;
316
317         /* trying to avoid low memory issues */
318         flags = current->flags & PF_MEMALLOC;
319         current->flags |= PF_MEMALLOC;
320         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
321         current->flags &= (~PF_MEMALLOC | flags);
322         return vaddr;
323 }
324
325
326 static inline void *alloc_pgtable_page(void)
327 {
328         unsigned int flags;
329         void *vaddr;
330
331         /* trying to avoid low memory issues */
332         flags = current->flags & PF_MEMALLOC;
333         current->flags |= PF_MEMALLOC;
334         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
335         current->flags &= (~PF_MEMALLOC | flags);
336         return vaddr;
337 }
338
339 static inline void free_pgtable_page(void *vaddr)
340 {
341         free_page((unsigned long)vaddr);
342 }
343
344 static inline void *alloc_domain_mem(void)
345 {
346         return iommu_kmem_cache_alloc(iommu_domain_cache);
347 }
348
349 static void free_domain_mem(void *vaddr)
350 {
351         kmem_cache_free(iommu_domain_cache, vaddr);
352 }
353
354 static inline void * alloc_devinfo_mem(void)
355 {
356         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
357 }
358
359 static inline void free_devinfo_mem(void *vaddr)
360 {
361         kmem_cache_free(iommu_devinfo_cache, vaddr);
362 }
363
364 struct iova *alloc_iova_mem(void)
365 {
366         return iommu_kmem_cache_alloc(iommu_iova_cache);
367 }
368
369 void free_iova_mem(struct iova *iova)
370 {
371         kmem_cache_free(iommu_iova_cache, iova);
372 }
373
374
375 static inline int width_to_agaw(int width);
376
377 /* calculate agaw for each iommu.
378  * "SAGAW" may differ across iommus; use the default agaw and fall back
379  * to a smaller supported agaw for iommus that don't support the default.
380  */
381 int iommu_calculate_agaw(struct intel_iommu *iommu)
382 {
383         unsigned long sagaw;
384         int agaw = -1;
385
386         sagaw = cap_sagaw(iommu->cap);
387         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
388              agaw >= 0; agaw--) {
389                 if (test_bit(agaw, &sagaw))
390                         break;
391         }
392
393         return agaw;
394 }
395
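/*
 * Worked example (illustrative): with the default domain address width
 * of 48 bits, width_to_agaw(48) = (48 - 30) / 9 = 2, i.e. a 4-level
 * page table.  If the SAGAW field says agaw 2 is unsupported, the loop
 * above falls back to agaw 1 (39 bits, 3 levels) or agaw 0 (30 bits,
 * 2 levels); -1 means no supported agaw was found.
 */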
396 /* in native case, each domain is related to only one iommu */
397 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
398 {
399         int iommu_id;
400
401         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
402
403         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
404         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
405                 return NULL;
406
407         return g_iommus[iommu_id];
408 }
409
410 /* "Coherency" capability may be different across iommus */
411 static void domain_update_iommu_coherency(struct dmar_domain *domain)
412 {
413         int i;
414
415         domain->iommu_coherency = 1;
416
417         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
418         for (; i < g_num_of_iommus; ) {
419                 if (!ecap_coherent(g_iommus[i]->ecap)) {
420                         domain->iommu_coherency = 0;
421                         break;
422                 }
423                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
424         }
425 }
426
427 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
428 {
429         struct dmar_drhd_unit *drhd = NULL;
430         int i;
431
432         for_each_drhd_unit(drhd) {
433                 if (drhd->ignored)
434                         continue;
435
436                 for (i = 0; i < drhd->devices_cnt; i++)
437                         if (drhd->devices[i]->bus->number == bus &&
438                             drhd->devices[i]->devfn == devfn)
439                                 return drhd->iommu;
440
441                 if (drhd->include_all)
442                         return drhd->iommu;
443         }
444
445         return NULL;
446 }
447
448 static void domain_flush_cache(struct dmar_domain *domain,
449                                void *addr, int size)
450 {
451         if (!domain->iommu_coherency)
452                 clflush_cache_range(addr, size);
453 }
454
455 /* Gets context entry for a given bus and devfn */
456 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
457                 u8 bus, u8 devfn)
458 {
459         struct root_entry *root;
460         struct context_entry *context;
461         unsigned long phy_addr;
462         unsigned long flags;
463
464         spin_lock_irqsave(&iommu->lock, flags);
465         root = &iommu->root_entry[bus];
466         context = get_context_addr_from_root(root);
467         if (!context) {
468                 context = (struct context_entry *)alloc_pgtable_page();
469                 if (!context) {
470                         spin_unlock_irqrestore(&iommu->lock, flags);
471                         return NULL;
472                 }
473                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
474                 phy_addr = virt_to_phys((void *)context);
475                 set_root_value(root, phy_addr);
476                 set_root_present(root);
477                 __iommu_flush_cache(iommu, root, sizeof(*root));
478         }
479         spin_unlock_irqrestore(&iommu->lock, flags);
480         return &context[devfn];
481 }
482
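/*
 * Worked example (illustrative): each of the 256 root entries (one per
 * bus) points at a 4KB context table of 256 16-byte context entries,
 * one per devfn.  For device 00:1f.2, bus 0 selects root_entry[0] and
 * devfn PCI_DEVFN(0x1f, 2) = 0xfa selects &context[0xfa], which is what
 * the function above returns.
 */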
483 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
484 {
485         struct root_entry *root;
486         struct context_entry *context;
487         int ret;
488         unsigned long flags;
489
490         spin_lock_irqsave(&iommu->lock, flags);
491         root = &iommu->root_entry[bus];
492         context = get_context_addr_from_root(root);
493         if (!context) {
494                 ret = 0;
495                 goto out;
496         }
497         ret = context_present(&context[devfn]);
498 out:
499         spin_unlock_irqrestore(&iommu->lock, flags);
500         return ret;
501 }
502
503 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
504 {
505         struct root_entry *root;
506         struct context_entry *context;
507         unsigned long flags;
508
509         spin_lock_irqsave(&iommu->lock, flags);
510         root = &iommu->root_entry[bus];
511         context = get_context_addr_from_root(root);
512         if (context) {
513                 context_clear_entry(&context[devfn]);
514                 __iommu_flush_cache(iommu, &context[devfn],
515                         sizeof(*context));
516         }
517         spin_unlock_irqrestore(&iommu->lock, flags);
518 }
519
520 static void free_context_table(struct intel_iommu *iommu)
521 {
522         struct root_entry *root;
523         int i;
524         unsigned long flags;
525         struct context_entry *context;
526
527         spin_lock_irqsave(&iommu->lock, flags);
528         if (!iommu->root_entry) {
529                 goto out;
530         }
531         for (i = 0; i < ROOT_ENTRY_NR; i++) {
532                 root = &iommu->root_entry[i];
533                 context = get_context_addr_from_root(root);
534                 if (context)
535                         free_pgtable_page(context);
536         }
537         free_pgtable_page(iommu->root_entry);
538         iommu->root_entry = NULL;
539 out:
540         spin_unlock_irqrestore(&iommu->lock, flags);
541 }
542
543 /* page table handling */
544 #define LEVEL_STRIDE            (9)
545 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
546
547 static inline int agaw_to_level(int agaw)
548 {
549         return agaw + 2;
550 }
551
552 static inline int agaw_to_width(int agaw)
553 {
554         return 30 + agaw * LEVEL_STRIDE;
555
556 }
557
558 static inline int width_to_agaw(int width)
559 {
560         return (width - 30) / LEVEL_STRIDE;
561 }
562
563 static inline unsigned int level_to_offset_bits(int level)
564 {
565         return (12 + (level - 1) * LEVEL_STRIDE);
566 }
567
568 static inline int address_level_offset(u64 addr, int level)
569 {
570         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
571 }
572
573 static inline u64 level_mask(int level)
574 {
575         return ((u64)-1 << level_to_offset_bits(level));
576 }
577
578 static inline u64 level_size(int level)
579 {
580         return ((u64)1 << level_to_offset_bits(level));
581 }
582
583 static inline u64 align_to_level(u64 addr, int level)
584 {
585         return ((addr + level_size(level) - 1) & level_mask(level));
586 }
587
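/*
 * Worked example (illustrative): for a 4-level table (agaw 2, 48-bit
 * width) the walk in addr_to_dma_pte() below consumes 9 address bits
 * per level:
 *	level 4: address bits 39-47	(level_to_offset_bits(4) = 39)
 *	level 3: address bits 30-38
 *	level 2: address bits 21-29
 *	level 1: address bits 12-20, the last-level PTE for one 4KB page
 */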
588 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
589 {
590         int addr_width = agaw_to_width(domain->agaw);
591         struct dma_pte *parent, *pte = NULL;
592         int level = agaw_to_level(domain->agaw);
593         int offset;
594         unsigned long flags;
595
596         BUG_ON(!domain->pgd);
597
598         addr &= (((u64)1) << addr_width) - 1;
599         parent = domain->pgd;
600
601         spin_lock_irqsave(&domain->mapping_lock, flags);
602         while (level > 0) {
603                 void *tmp_page;
604
605                 offset = address_level_offset(addr, level);
606                 pte = &parent[offset];
607                 if (level == 1)
608                         break;
609
610                 if (!dma_pte_present(pte)) {
611                         tmp_page = alloc_pgtable_page();
612
613                         if (!tmp_page) {
614                                 spin_unlock_irqrestore(&domain->mapping_lock,
615                                         flags);
616                                 return NULL;
617                         }
618                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
619                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
620                         /*
621                          * higher-level tables always set r/w; the last-level
622                          * page table controls read/write
623                          */
624                         dma_set_pte_readable(pte);
625                         dma_set_pte_writable(pte);
626                         domain_flush_cache(domain, pte, sizeof(*pte));
627                 }
628                 parent = phys_to_virt(dma_pte_addr(pte));
629                 level--;
630         }
631
632         spin_unlock_irqrestore(&domain->mapping_lock, flags);
633         return pte;
634 }
635
636 /* return address's pte at specific level */
637 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
638                 int level)
639 {
640         struct dma_pte *parent, *pte = NULL;
641         int total = agaw_to_level(domain->agaw);
642         int offset;
643
644         parent = domain->pgd;
645         while (level <= total) {
646                 offset = address_level_offset(addr, total);
647                 pte = &parent[offset];
648                 if (level == total)
649                         return pte;
650
651                 if (!dma_pte_present(pte))
652                         break;
653                 parent = phys_to_virt(dma_pte_addr(pte));
654                 total--;
655         }
656         return NULL;
657 }
658
659 /* clear one page's page table */
660 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
661 {
662         struct dma_pte *pte = NULL;
663
664         /* get last level pte */
665         pte = dma_addr_level_pte(domain, addr, 1);
666
667         if (pte) {
668                 dma_clear_pte(pte);
669                 domain_flush_cache(domain, pte, sizeof(*pte));
670         }
671 }
672
673 /* clear last level pte, a tlb flush should be followed */
674 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
675 {
676         int addr_width = agaw_to_width(domain->agaw);
677
678         start &= (((u64)1) << addr_width) - 1;
679         end &= (((u64)1) << addr_width) - 1;
680         /* in case it's partial page */
681         start = PAGE_ALIGN(start);
682         end &= PAGE_MASK;
683
684         /* we don't need lock here, nobody else touches the iova range */
685         while (start < end) {
686                 dma_pte_clear_one(domain, start);
687                 start += VTD_PAGE_SIZE;
688         }
689 }
690
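/*
 * Worked example (illustrative, assuming 4KB pages): a call such as
 * dma_pte_clear_range(domain, 0x1234, 0x5678) rounds the start up to
 * 0x2000 and the end down to 0x5000, so only the last-level PTEs for
 * 0x2000, 0x3000 and 0x4000 are cleared; partial pages at either end
 * are left alone.
 */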
691 /* free page table pages. last level pte should already be cleared */
692 static void dma_pte_free_pagetable(struct dmar_domain *domain,
693         u64 start, u64 end)
694 {
695         int addr_width = agaw_to_width(domain->agaw);
696         struct dma_pte *pte;
697         int total = agaw_to_level(domain->agaw);
698         int level;
699         u64 tmp;
700
701         start &= (((u64)1) << addr_width) - 1;
702         end &= (((u64)1) << addr_width) - 1;
703
704         /* we don't need lock here, nobody else touches the iova range */
705         level = 2;
706         while (level <= total) {
707                 tmp = align_to_level(start, level);
708                 if (tmp >= end || (tmp + level_size(level) > end))
709                         return;
710
711                 while (tmp < end) {
712                         pte = dma_addr_level_pte(domain, tmp, level);
713                         if (pte) {
714                                 free_pgtable_page(
715                                         phys_to_virt(dma_pte_addr(pte)));
716                                 dma_clear_pte(pte);
717                                 domain_flush_cache(domain, pte, sizeof(*pte));
718                         }
719                         tmp += level_size(level);
720                 }
721                 level++;
722         }
723         /* free pgd */
724         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
725                 free_pgtable_page(domain->pgd);
726                 domain->pgd = NULL;
727         }
728 }
729
730 /* iommu handling */
731 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
732 {
733         struct root_entry *root;
734         unsigned long flags;
735
736         root = (struct root_entry *)alloc_pgtable_page();
737         if (!root)
738                 return -ENOMEM;
739
740         __iommu_flush_cache(iommu, root, ROOT_SIZE);
741
742         spin_lock_irqsave(&iommu->lock, flags);
743         iommu->root_entry = root;
744         spin_unlock_irqrestore(&iommu->lock, flags);
745
746         return 0;
747 }
748
749 static void iommu_set_root_entry(struct intel_iommu *iommu)
750 {
751         void *addr;
752         u32 cmd, sts;
753         unsigned long flag;
754
755         addr = iommu->root_entry;
756
757         spin_lock_irqsave(&iommu->register_lock, flag);
758         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
759
760         cmd = iommu->gcmd | DMA_GCMD_SRTP;
761         writel(cmd, iommu->reg + DMAR_GCMD_REG);
762
763         /* Make sure hardware completes it */
764         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
765                 readl, (sts & DMA_GSTS_RTPS), sts);
766
767         spin_unlock_irqrestore(&iommu->register_lock, flag);
768 }
769
770 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
771 {
772         u32 val;
773         unsigned long flag;
774
775         if (!cap_rwbf(iommu->cap))
776                 return;
777         val = iommu->gcmd | DMA_GCMD_WBF;
778
779         spin_lock_irqsave(&iommu->register_lock, flag);
780         writel(val, iommu->reg + DMAR_GCMD_REG);
781
782         /* Make sure hardware completes it */
783         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
784                         readl, (!(val & DMA_GSTS_WBFS)), val);
785
786         spin_unlock_irqrestore(&iommu->register_lock, flag);
787 }
788
789 /* return value determines whether we need a write buffer flush */
790 static int __iommu_flush_context(struct intel_iommu *iommu,
791         u16 did, u16 source_id, u8 function_mask, u64 type,
792         int non_present_entry_flush)
793 {
794         u64 val = 0;
795         unsigned long flag;
796
797         /*
798          * In the non-present entry flush case, if the hardware doesn't
799          * cache non-present entries we do nothing; if it does cache them,
800          * we flush entries of domain 0 (the domain id used to cache
801          * any non-present entries)
802          */
803         if (non_present_entry_flush) {
804                 if (!cap_caching_mode(iommu->cap))
805                         return 1;
806                 else
807                         did = 0;
808         }
809
810         switch (type) {
811         case DMA_CCMD_GLOBAL_INVL:
812                 val = DMA_CCMD_GLOBAL_INVL;
813                 break;
814         case DMA_CCMD_DOMAIN_INVL:
815                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
816                 break;
817         case DMA_CCMD_DEVICE_INVL:
818                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
819                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
820                 break;
821         default:
822                 BUG();
823         }
824         val |= DMA_CCMD_ICC;
825
826         spin_lock_irqsave(&iommu->register_lock, flag);
827         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
828
829         /* Make sure hardware completes it */
830         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
831                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
832
833         spin_unlock_irqrestore(&iommu->register_lock, flag);
834
835         /* flush context entry will implicitly flush write buffer */
836         return 0;
837 }
838
839 /* return value determines whether we need a write buffer flush */
840 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
841         u64 addr, unsigned int size_order, u64 type,
842         int non_present_entry_flush)
843 {
844         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
845         u64 val = 0, val_iva = 0;
846         unsigned long flag;
847
848         /*
849          * In the non-present entry flush case, if the hardware doesn't
850          * cache non-present entries we do nothing; if it does cache them,
851          * we flush entries of domain 0 (the domain id used to cache
852          * any non-present entries)
853          */
854         if (non_present_entry_flush) {
855                 if (!cap_caching_mode(iommu->cap))
856                         return 1;
857                 else
858                         did = 0;
859         }
860
861         switch (type) {
862         case DMA_TLB_GLOBAL_FLUSH:
863                 /* global flush doesn't need set IVA_REG */
864                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
865                 break;
866         case DMA_TLB_DSI_FLUSH:
867                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
868                 break;
869         case DMA_TLB_PSI_FLUSH:
870                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
871                 /* Note: always flush non-leaf currently */
872                 val_iva = size_order | addr;
873                 break;
874         default:
875                 BUG();
876         }
877         /* Note: set drain read/write */
878 #if 0
879         /*
880          * This is probably just being extra safe.  Looks like we can
881          * ignore it without any impact.
882          */
883         if (cap_read_drain(iommu->cap))
884                 val |= DMA_TLB_READ_DRAIN;
885 #endif
886         if (cap_write_drain(iommu->cap))
887                 val |= DMA_TLB_WRITE_DRAIN;
888
889         spin_lock_irqsave(&iommu->register_lock, flag);
890         /* Note: Only uses first TLB reg currently */
891         if (val_iva)
892                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
893         dmar_writeq(iommu->reg + tlb_offset + 8, val);
894
895         /* Make sure hardware completes it */
896         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
897                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
898
899         spin_unlock_irqrestore(&iommu->register_lock, flag);
900
901         /* check IOTLB invalidation granularity */
902         if (DMA_TLB_IAIG(val) == 0)
903                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
904         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
905                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
906                         (unsigned long long)DMA_TLB_IIRG(type),
907                         (unsigned long long)DMA_TLB_IAIG(val));
908         /* flush iotlb entry will implicitly flush write buffer */
909         return 0;
910 }
911
912 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
913         u64 addr, unsigned int pages, int non_present_entry_flush)
914 {
915         unsigned int mask;
916
917         BUG_ON(addr & (~VTD_PAGE_MASK));
918         BUG_ON(pages == 0);
919
920         /* Fallback to domain selective flush if no PSI support */
921         if (!cap_pgsel_inv(iommu->cap))
922                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
923                                                 DMA_TLB_DSI_FLUSH,
924                                                 non_present_entry_flush);
925
926         /*
927          * PSI requires page size to be 2 ^ x, and the base address is naturally
928          * aligned to the size
929          */
930         mask = ilog2(__roundup_pow_of_two(pages));
931         /* Fallback to domain selective flush if size is too big */
932         if (mask > cap_max_amask_val(iommu->cap))
933                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
934                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
935
936         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
937                                         DMA_TLB_PSI_FLUSH,
938                                         non_present_entry_flush);
939 }
940
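/*
 * Worked example (illustrative): for a request covering 5 pages,
 * __roundup_pow_of_two(5) = 8 and mask = ilog2(8) = 3, so the PSI
 * invalidation above covers 2^3 = 8 pages at the naturally aligned
 * base address; if 3 exceeded cap_max_amask_val() we would fall back
 * to the domain-selective flush instead.
 */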
941 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
942 {
943         u32 pmen;
944         unsigned long flags;
945
946         spin_lock_irqsave(&iommu->register_lock, flags);
947         pmen = readl(iommu->reg + DMAR_PMEN_REG);
948         pmen &= ~DMA_PMEN_EPM;
949         writel(pmen, iommu->reg + DMAR_PMEN_REG);
950
951         /* wait for the protected region status bit to clear */
952         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
953                 readl, !(pmen & DMA_PMEN_PRS), pmen);
954
955         spin_unlock_irqrestore(&iommu->register_lock, flags);
956 }
957
958 static int iommu_enable_translation(struct intel_iommu *iommu)
959 {
960         u32 sts;
961         unsigned long flags;
962
963         spin_lock_irqsave(&iommu->register_lock, flags);
964         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
965
966         /* Make sure hardware completes it */
967         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
968                 readl, (sts & DMA_GSTS_TES), sts);
969
970         iommu->gcmd |= DMA_GCMD_TE;
971         spin_unlock_irqrestore(&iommu->register_lock, flags);
972         return 0;
973 }
974
975 static int iommu_disable_translation(struct intel_iommu *iommu)
976 {
977         u32 sts;
978         unsigned long flag;
979
980         spin_lock_irqsave(&iommu->register_lock, flag);
981         iommu->gcmd &= ~DMA_GCMD_TE;
982         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
983
984         /* Make sure hardware completes it */
985         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
986                 readl, (!(sts & DMA_GSTS_TES)), sts);
987
988         spin_unlock_irqrestore(&iommu->register_lock, flag);
989         return 0;
990 }
991
992 /* iommu interrupt handling. Most of it is MSI-like. */
993
994 static const char *fault_reason_strings[] =
995 {
996         "Software",
997         "Present bit in root entry is clear",
998         "Present bit in context entry is clear",
999         "Invalid context entry",
1000         "Access beyond MGAW",
1001         "PTE Write access is not set",
1002         "PTE Read access is not set",
1003         "Next page table ptr is invalid",
1004         "Root table address invalid",
1005         "Context table ptr is invalid",
1006         "non-zero reserved fields in RTP",
1007         "non-zero reserved fields in CTP",
1008         "non-zero reserved fields in PTE",
1009 };
1010 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1011
1012 const char *dmar_get_fault_reason(u8 fault_reason)
1013 {
1014         if (fault_reason > MAX_FAULT_REASON_IDX)
1015                 return "Unknown";
1016         else
1017                 return fault_reason_strings[fault_reason];
1018 }
1019
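/*
 * Worked example (illustrative): fault reason 5 maps to "PTE Write
 * access is not set"; any reason above MAX_FAULT_REASON_IDX (12 with
 * the table above) is reported as "Unknown".
 */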
1020 void dmar_msi_unmask(unsigned int irq)
1021 {
1022         struct intel_iommu *iommu = get_irq_data(irq);
1023         unsigned long flag;
1024
1025         /* unmask it */
1026         spin_lock_irqsave(&iommu->register_lock, flag);
1027         writel(0, iommu->reg + DMAR_FECTL_REG);
1028         /* Read a reg to force flush the posted write */
1029         readl(iommu->reg + DMAR_FECTL_REG);
1030         spin_unlock_irqrestore(&iommu->register_lock, flag);
1031 }
1032
1033 void dmar_msi_mask(unsigned int irq)
1034 {
1035         unsigned long flag;
1036         struct intel_iommu *iommu = get_irq_data(irq);
1037
1038         /* mask it */
1039         spin_lock_irqsave(&iommu->register_lock, flag);
1040         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1041         /* Read a reg to force flush the posted write */
1042         readl(iommu->reg + DMAR_FECTL_REG);
1043         spin_unlock_irqrestore(&iommu->register_lock, flag);
1044 }
1045
1046 void dmar_msi_write(int irq, struct msi_msg *msg)
1047 {
1048         struct intel_iommu *iommu = get_irq_data(irq);
1049         unsigned long flag;
1050
1051         spin_lock_irqsave(&iommu->register_lock, flag);
1052         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1053         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1054         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1055         spin_unlock_irqrestore(&iommu->register_lock, flag);
1056 }
1057
1058 void dmar_msi_read(int irq, struct msi_msg *msg)
1059 {
1060         struct intel_iommu *iommu = get_irq_data(irq);
1061         unsigned long flag;
1062
1063         spin_lock_irqsave(&iommu->register_lock, flag);
1064         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1065         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1066         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1067         spin_unlock_irqrestore(&iommu->register_lock, flag);
1068 }
1069
1070 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1071                 u8 fault_reason, u16 source_id, unsigned long long addr)
1072 {
1073         const char *reason;
1074
1075         reason = dmar_get_fault_reason(fault_reason);
1076
1077         printk(KERN_ERR
1078                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1079                 "fault addr %llx \n"
1080                 "DMAR:[fault reason %02d] %s\n",
1081                 (type ? "DMA Read" : "DMA Write"),
1082                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1083                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1084         return 0;
1085 }
1086
1087 #define PRIMARY_FAULT_REG_LEN (16)
1088 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1089 {
1090         struct intel_iommu *iommu = dev_id;
1091         int reg, fault_index;
1092         u32 fault_status;
1093         unsigned long flag;
1094
1095         spin_lock_irqsave(&iommu->register_lock, flag);
1096         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1097
1098         /* TBD: ignore advanced fault log currently */
1099         if (!(fault_status & DMA_FSTS_PPF))
1100                 goto clear_overflow;
1101
1102         fault_index = dma_fsts_fault_record_index(fault_status);
1103         reg = cap_fault_reg_offset(iommu->cap);
1104         while (1) {
1105                 u8 fault_reason;
1106                 u16 source_id;
1107                 u64 guest_addr;
1108                 int type;
1109                 u32 data;
1110
1111                 /* highest 32 bits */
1112                 data = readl(iommu->reg + reg +
1113                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1114                 if (!(data & DMA_FRCD_F))
1115                         break;
1116
1117                 fault_reason = dma_frcd_fault_reason(data);
1118                 type = dma_frcd_type(data);
1119
1120                 data = readl(iommu->reg + reg +
1121                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1122                 source_id = dma_frcd_source_id(data);
1123
1124                 guest_addr = dmar_readq(iommu->reg + reg +
1125                                 fault_index * PRIMARY_FAULT_REG_LEN);
1126                 guest_addr = dma_frcd_page_addr(guest_addr);
1127                 /* clear the fault */
1128                 writel(DMA_FRCD_F, iommu->reg + reg +
1129                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1130
1131                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1132
1133                 iommu_page_fault_do_one(iommu, type, fault_reason,
1134                                 source_id, guest_addr);
1135
1136                 fault_index++;
1137                 if (fault_index > cap_num_fault_regs(iommu->cap))
1138                         fault_index = 0;
1139                 spin_lock_irqsave(&iommu->register_lock, flag);
1140         }
1141 clear_overflow:
1142         /* clear primary fault overflow */
1143         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1144         if (fault_status & DMA_FSTS_PFO)
1145                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1146
1147         spin_unlock_irqrestore(&iommu->register_lock, flag);
1148         return IRQ_HANDLED;
1149 }
1150
1151 int dmar_set_interrupt(struct intel_iommu *iommu)
1152 {
1153         int irq, ret;
1154
1155         irq = create_irq();
1156         if (!irq) {
1157                 printk(KERN_ERR "IOMMU: no free vectors\n");
1158                 return -EINVAL;
1159         }
1160
1161         set_irq_data(irq, iommu);
1162         iommu->irq = irq;
1163
1164         ret = arch_setup_dmar_msi(irq);
1165         if (ret) {
1166                 set_irq_data(irq, NULL);
1167                 iommu->irq = 0;
1168                 destroy_irq(irq);
1169                 return ret;
1170         }
1171
1172         /* Force the fault register to be cleared */
1173         iommu_page_fault(irq, iommu);
1174
1175         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1176         if (ret)
1177                 printk(KERN_ERR "IOMMU: can't request irq\n");
1178         return ret;
1179 }
1180
1181 static int iommu_init_domains(struct intel_iommu *iommu)
1182 {
1183         unsigned long ndomains;
1184         unsigned long nlongs;
1185
1186         ndomains = cap_ndoms(iommu->cap);
1187         pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1188         nlongs = BITS_TO_LONGS(ndomains);
1189
1190         /* TBD: there might be 64K domains,
1191          * consider other allocation for future chip
1192          */
1193         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1194         if (!iommu->domain_ids) {
1195                 printk(KERN_ERR "Allocating domain id array failed\n");
1196                 return -ENOMEM;
1197         }
1198         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1199                         GFP_KERNEL);
1200         if (!iommu->domains) {
1201                 printk(KERN_ERR "Allocating domain array failed\n");
1202                 kfree(iommu->domain_ids);
1203                 return -ENOMEM;
1204         }
1205
1206         spin_lock_init(&iommu->lock);
1207
1208         /*
1209          * if Caching mode is set, then invalid translations are tagged
1210          * with domain id 0. Hence we need to pre-allocate it.
1211          */
1212         if (cap_caching_mode(iommu->cap))
1213                 set_bit(0, iommu->domain_ids);
1214         return 0;
1215 }
1216
1217
1218 static void domain_exit(struct dmar_domain *domain);
1219 static void vm_domain_exit(struct dmar_domain *domain);
1220
1221 void free_dmar_iommu(struct intel_iommu *iommu)
1222 {
1223         struct dmar_domain *domain;
1224         int i;
1225         unsigned long flags;
1226
1227         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1228         for (; i < cap_ndoms(iommu->cap); ) {
1229                 domain = iommu->domains[i];
1230                 clear_bit(i, iommu->domain_ids);
1231
1232                 spin_lock_irqsave(&domain->iommu_lock, flags);
1233                 if (--domain->iommu_count == 0) {
1234                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1235                                 vm_domain_exit(domain);
1236                         else
1237                                 domain_exit(domain);
1238                 }
1239                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1240
1241                 i = find_next_bit(iommu->domain_ids,
1242                         cap_ndoms(iommu->cap), i+1);
1243         }
1244
1245         if (iommu->gcmd & DMA_GCMD_TE)
1246                 iommu_disable_translation(iommu);
1247
1248         if (iommu->irq) {
1249                 set_irq_data(iommu->irq, NULL);
1250                 /* This will mask the irq */
1251                 free_irq(iommu->irq, iommu);
1252                 destroy_irq(iommu->irq);
1253         }
1254
1255         kfree(iommu->domains);
1256         kfree(iommu->domain_ids);
1257
1258         g_iommus[iommu->seq_id] = NULL;
1259
1260         /* if all iommus are freed, free g_iommus */
1261         for (i = 0; i < g_num_of_iommus; i++) {
1262                 if (g_iommus[i])
1263                         break;
1264         }
1265
1266         if (i == g_num_of_iommus)
1267                 kfree(g_iommus);
1268
1269         /* free context mapping */
1270         free_context_table(iommu);
1271 }
1272
1273 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1274 {
1275         unsigned long num;
1276         unsigned long ndomains;
1277         struct dmar_domain *domain;
1278         unsigned long flags;
1279
1280         domain = alloc_domain_mem();
1281         if (!domain)
1282                 return NULL;
1283
1284         ndomains = cap_ndoms(iommu->cap);
1285
1286         spin_lock_irqsave(&iommu->lock, flags);
1287         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1288         if (num >= ndomains) {
1289                 spin_unlock_irqrestore(&iommu->lock, flags);
1290                 free_domain_mem(domain);
1291                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1292                 return NULL;
1293         }
1294
1295         set_bit(num, iommu->domain_ids);
1296         domain->id = num;
1297         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1298         set_bit(iommu->seq_id, &domain->iommu_bmp);
1299         domain->flags = 0;
1300         iommu->domains[num] = domain;
1301         spin_unlock_irqrestore(&iommu->lock, flags);
1302
1303         return domain;
1304 }
1305
1306 static void iommu_free_domain(struct dmar_domain *domain)
1307 {
1308         unsigned long flags;
1309         struct intel_iommu *iommu;
1310
1311         iommu = domain_get_iommu(domain);
1312
1313         spin_lock_irqsave(&iommu->lock, flags);
1314         clear_bit(domain->id, iommu->domain_ids);
1315         spin_unlock_irqrestore(&iommu->lock, flags);
1316 }
1317
1318 static struct iova_domain reserved_iova_list;
1319 static struct lock_class_key reserved_alloc_key;
1320 static struct lock_class_key reserved_rbtree_key;
1321
1322 static void dmar_init_reserved_ranges(void)
1323 {
1324         struct pci_dev *pdev = NULL;
1325         struct iova *iova;
1326         int i;
1327         u64 addr, size;
1328
1329         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1330
1331         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1332                 &reserved_alloc_key);
1333         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1334                 &reserved_rbtree_key);
1335
1336         /* IOAPIC ranges shouldn't be accessed by DMA */
1337         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1338                 IOVA_PFN(IOAPIC_RANGE_END));
1339         if (!iova)
1340                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1341
1342         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1343         for_each_pci_dev(pdev) {
1344                 struct resource *r;
1345
1346                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1347                         r = &pdev->resource[i];
1348                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1349                                 continue;
1350                         addr = r->start;
1351                         addr &= PAGE_MASK;
1352                         size = r->end - addr;
1353                         size = PAGE_ALIGN(size);
1354                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1355                                 IOVA_PFN(size + addr) - 1);
1356                         if (!iova)
1357                                 printk(KERN_ERR "Reserve iova failed\n");
1358                 }
1359         }
1360
1361 }
1362
1363 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1364 {
1365         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1366 }
1367
1368 static inline int guestwidth_to_adjustwidth(int gaw)
1369 {
1370         int agaw;
1371         int r = (gaw - 12) % 9;
1372
1373         if (r == 0)
1374                 agaw = gaw;
1375         else
1376                 agaw = gaw + 9 - r;
1377         if (agaw > 64)
1378                 agaw = 64;
1379         return agaw;
1380 }
1381
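/*
 * Worked example (illustrative): the adjusted width is the guest width
 * rounded up so that (width - 12) is a multiple of 9, matching the
 * 9-bit-per-level layout: gaw 48 stays 48 and gaw 39 stays 39, but for
 * gaw 32, (32 - 12) % 9 = 2, so the result is 32 + 9 - 2 = 39; anything
 * above 64 is clamped to 64.
 */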
1382 static int domain_init(struct dmar_domain *domain, int guest_width)
1383 {
1384         struct intel_iommu *iommu;
1385         int adjust_width, agaw;
1386         unsigned long sagaw;
1387
1388         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1389         spin_lock_init(&domain->mapping_lock);
1390         spin_lock_init(&domain->iommu_lock);
1391
1392         domain_reserve_special_ranges(domain);
1393
1394         /* calculate AGAW */
1395         iommu = domain_get_iommu(domain);
1396         if (guest_width > cap_mgaw(iommu->cap))
1397                 guest_width = cap_mgaw(iommu->cap);
1398         domain->gaw = guest_width;
1399         adjust_width = guestwidth_to_adjustwidth(guest_width);
1400         agaw = width_to_agaw(adjust_width);
1401         sagaw = cap_sagaw(iommu->cap);
1402         if (!test_bit(agaw, &sagaw)) {
1403                 /* hardware doesn't support it, choose a bigger one */
1404                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1405                 agaw = find_next_bit(&sagaw, 5, agaw);
1406                 if (agaw >= 5)
1407                         return -ENODEV;
1408         }
1409         domain->agaw = agaw;
1410         INIT_LIST_HEAD(&domain->devices);
1411
1412         if (ecap_coherent(iommu->ecap))
1413                 domain->iommu_coherency = 1;
1414         else
1415                 domain->iommu_coherency = 0;
1416
1417         domain->iommu_count = 1;
1418
1419         /* always allocate the top pgd */
1420         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1421         if (!domain->pgd)
1422                 return -ENOMEM;
1423         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1424         return 0;
1425 }
1426
1427 static void domain_exit(struct dmar_domain *domain)
1428 {
1429         u64 end;
1430
1431         /* Domain 0 is reserved, so don't process it */
1432         if (!domain)
1433                 return;
1434
1435         domain_remove_dev_info(domain);
1436         /* destroy iovas */
1437         put_iova_domain(&domain->iovad);
1438         end = DOMAIN_MAX_ADDR(domain->gaw);
1439         end = end & (~PAGE_MASK);
1440
1441         /* clear ptes */
1442         dma_pte_clear_range(domain, 0, end);
1443
1444         /* free page tables */
1445         dma_pte_free_pagetable(domain, 0, end);
1446
1447         iommu_free_domain(domain);
1448         free_domain_mem(domain);
1449 }
1450
1451 static int domain_context_mapping_one(struct dmar_domain *domain,
1452                 u8 bus, u8 devfn)
1453 {
1454         struct context_entry *context;
1455         unsigned long flags;
1456         struct intel_iommu *iommu;
1457
1458         pr_debug("Set context mapping for %02x:%02x.%d\n",
1459                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1460         BUG_ON(!domain->pgd);
1461
1462         iommu = device_to_iommu(bus, devfn);
1463         if (!iommu)
1464                 return -ENODEV;
1465
1466         context = device_to_context_entry(iommu, bus, devfn);
1467         if (!context)
1468                 return -ENOMEM;
1469         spin_lock_irqsave(&iommu->lock, flags);
1470         if (context_present(context)) {
1471                 spin_unlock_irqrestore(&iommu->lock, flags);
1472                 return 0;
1473         }
1474
1475         context_set_domain_id(context, domain->id);
1476         context_set_address_width(context, domain->agaw);
1477         context_set_address_root(context, virt_to_phys(domain->pgd));
1478         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1479         context_set_fault_enable(context);
1480         context_set_present(context);
1481         domain_flush_cache(domain, context, sizeof(*context));
1482
1483         /* it's a non-present to present mapping */
1484         if (iommu->flush.flush_context(iommu, domain->id,
1485                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1486                 DMA_CCMD_DEVICE_INVL, 1))
1487                 iommu_flush_write_buffer(iommu);
1488         else
1489                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1490
1491         spin_unlock_irqrestore(&iommu->lock, flags);
1492
1493         spin_lock_irqsave(&domain->iommu_lock, flags);
1494         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1495                 domain->iommu_count++;
1496                 domain_update_iommu_coherency(domain);
1497         }
1498         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1499         return 0;
1500 }
1501
1502 static int
1503 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1504 {
1505         int ret;
1506         struct pci_dev *tmp, *parent;
1507
1508         ret = domain_context_mapping_one(domain, pdev->bus->number,
1509                 pdev->devfn);
1510         if (ret)
1511                 return ret;
1512
1513         /* dependent device mapping */
1514         tmp = pci_find_upstream_pcie_bridge(pdev);
1515         if (!tmp)
1516                 return 0;
1517         /* Secondary interface's bus number and devfn 0 */
1518         parent = pdev->bus->self;
1519         while (parent != tmp) {
1520                 ret = domain_context_mapping_one(domain, parent->bus->number,
1521                         parent->devfn);
1522                 if (ret)
1523                         return ret;
1524                 parent = parent->bus->self;
1525         }
1526         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1527                 return domain_context_mapping_one(domain,
1528                         tmp->subordinate->number, 0);
1529         else /* this is a legacy PCI bridge */
1530                 return domain_context_mapping_one(domain,
1531                         tmp->bus->number, tmp->devfn);
1532 }
1533
1534 static int domain_context_mapped(struct pci_dev *pdev)
1535 {
1536         int ret;
1537         struct pci_dev *tmp, *parent;
1538         struct intel_iommu *iommu;
1539
1540         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1541         if (!iommu)
1542                 return -ENODEV;
1543
1544         ret = device_context_mapped(iommu,
1545                 pdev->bus->number, pdev->devfn);
1546         if (!ret)
1547                 return ret;
1548         /* dependent device mapping */
1549         tmp = pci_find_upstream_pcie_bridge(pdev);
1550         if (!tmp)
1551                 return ret;
1552         /* Secondary interface's bus number and devfn 0 */
1553         parent = pdev->bus->self;
1554         while (parent != tmp) {
1555                 ret = device_context_mapped(iommu, parent->bus->number,
1556                         parent->devfn);
1557                 if (!ret)
1558                         return ret;
1559                 parent = parent->bus->self;
1560         }
1561         if (tmp->is_pcie)
1562                 return device_context_mapped(iommu,
1563                         tmp->subordinate->number, 0);
1564         else
1565                 return device_context_mapped(iommu,
1566                         tmp->bus->number, tmp->devfn);
1567 }
1568
1569 static int
1570 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1571                         u64 hpa, size_t size, int prot)
1572 {
1573         u64 start_pfn, end_pfn;
1574         struct dma_pte *pte;
1575         int index;
1576         int addr_width = agaw_to_width(domain->agaw);
1577
1578         hpa &= (((u64)1) << addr_width) - 1;
1579
1580         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1581                 return -EINVAL;
1582         iova &= PAGE_MASK;
1583         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1584         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1585         index = 0;
1586         while (start_pfn < end_pfn) {
1587                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1588                 if (!pte)
1589                         return -ENOMEM;
1590                 /* We don't need lock here, nobody else
1591                  * touches the iova range
1592                  */
1593                 BUG_ON(dma_pte_addr(pte));
1594                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1595                 dma_set_pte_prot(pte, prot);
1596                 domain_flush_cache(domain, pte, sizeof(*pte));
1597                 start_pfn++;
1598                 index++;
1599         }
1600         return 0;
1601 }
1602
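     /*
      * Clear the context entry for (bus, devfn) on @iommu and globally
      * invalidate the context cache and IOTLB so the hardware stops using
      * the stale translation.
      */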
1603 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1604 {
1605         if (!iommu)
1606                 return;
1607
1608         clear_context_table(iommu, bus, devfn);
1609         iommu->flush.flush_context(iommu, 0, 0, 0,
1610                                            DMA_CCMD_GLOBAL_INVL, 0);
1611         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1612                                          DMA_TLB_GLOBAL_FLUSH, 0);
1613 }
1614
1615 static void domain_remove_dev_info(struct dmar_domain *domain)
1616 {
1617         struct device_domain_info *info;
1618         unsigned long flags;
1619         struct intel_iommu *iommu;
1620
1621         spin_lock_irqsave(&device_domain_lock, flags);
1622         while (!list_empty(&domain->devices)) {
1623                 info = list_entry(domain->devices.next,
1624                         struct device_domain_info, link);
1625                 list_del(&info->link);
1626                 list_del(&info->global);
1627                 if (info->dev)
1628                         info->dev->dev.archdata.iommu = NULL;
1629                 spin_unlock_irqrestore(&device_domain_lock, flags);
1630
1631                 iommu = device_to_iommu(info->bus, info->devfn);
1632                 iommu_detach_dev(iommu, info->bus, info->devfn);
1633                 free_devinfo_mem(info);
1634
1635                 spin_lock_irqsave(&device_domain_lock, flags);
1636         }
1637         spin_unlock_irqrestore(&device_domain_lock, flags);
1638 }
1639
1640 /*
1641  * find_domain
1642  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1643  */
1644 static struct dmar_domain *
1645 find_domain(struct pci_dev *pdev)
1646 {
1647         struct device_domain_info *info;
1648
1649         /* No lock here; assume no concurrent domain_exit() in the normal case */
1650         info = pdev->dev.archdata.iommu;
1651         if (info)
1652                 return info->domain;
1653         return NULL;
1654 }
1655
1656 /* Find or allocate the domain for a device; the returned domain is initialized. */
1657 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1658 {
1659         struct dmar_domain *domain, *found = NULL;
1660         struct intel_iommu *iommu;
1661         struct dmar_drhd_unit *drhd;
1662         struct device_domain_info *info, *tmp;
1663         struct pci_dev *dev_tmp;
1664         unsigned long flags;
1665         int bus = 0, devfn = 0;
1666
1667         domain = find_domain(pdev);
1668         if (domain)
1669                 return domain;
1670
1671         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1672         if (dev_tmp) {
1673                 if (dev_tmp->is_pcie) {
1674                         bus = dev_tmp->subordinate->number;
1675                         devfn = 0;
1676                 } else {
1677                         bus = dev_tmp->bus->number;
1678                         devfn = dev_tmp->devfn;
1679                 }
1680                 spin_lock_irqsave(&device_domain_lock, flags);
1681                 list_for_each_entry(info, &device_domain_list, global) {
1682                         if (info->bus == bus && info->devfn == devfn) {
1683                                 found = info->domain;
1684                                 break;
1685                         }
1686                 }
1687                 spin_unlock_irqrestore(&device_domain_lock, flags);
1688                 /* pcie-to-pci bridge already has a domain; use it */
1689                 if (found) {
1690                         domain = found;
1691                         goto found_domain;
1692                 }
1693         }
1694
1695         /* Allocate new domain for the device */
1696         drhd = dmar_find_matched_drhd_unit(pdev);
1697         if (!drhd) {
1698                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1699                         pci_name(pdev));
1700                 return NULL;
1701         }
1702         iommu = drhd->iommu;
1703
1704         domain = iommu_alloc_domain(iommu);
1705         if (!domain)
1706                 goto error;
1707
1708         if (domain_init(domain, gaw)) {
1709                 domain_exit(domain);
1710                 goto error;
1711         }
1712
1713         /* register pcie-to-pci device */
1714         if (dev_tmp) {
1715                 info = alloc_devinfo_mem();
1716                 if (!info) {
1717                         domain_exit(domain);
1718                         goto error;
1719                 }
1720                 info->bus = bus;
1721                 info->devfn = devfn;
1722                 info->dev = NULL;
1723                 info->domain = domain;
1724                 /* This domain is shared by devices under p2p bridge */
1725                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1726
1727                 /* pcie-to-pci bridge already has a domain; use it */
1728                 found = NULL;
1729                 spin_lock_irqsave(&device_domain_lock, flags);
1730                 list_for_each_entry(tmp, &device_domain_list, global) {
1731                         if (tmp->bus == bus && tmp->devfn == devfn) {
1732                                 found = tmp->domain;
1733                                 break;
1734                         }
1735                 }
1736                 if (found) {
1737                         free_devinfo_mem(info);
1738                         domain_exit(domain);
1739                         domain = found;
1740                 } else {
1741                         list_add(&info->link, &domain->devices);
1742                         list_add(&info->global, &device_domain_list);
1743                 }
1744                 spin_unlock_irqrestore(&device_domain_lock, flags);
1745         }
1746
1747 found_domain:
1748         info = alloc_devinfo_mem();
1749         if (!info)
1750                 goto error;
1751         info->bus = pdev->bus->number;
1752         info->devfn = pdev->devfn;
1753         info->dev = pdev;
1754         info->domain = domain;
1755         spin_lock_irqsave(&device_domain_lock, flags);
1756         /* somebody else was faster and already attached a domain */
1757         found = find_domain(pdev);
1758         if (found != NULL) {
1759                 spin_unlock_irqrestore(&device_domain_lock, flags);
1760                 if (found != domain) {
1761                         domain_exit(domain);
1762                         domain = found;
1763                 }
1764                 free_devinfo_mem(info);
1765                 return domain;
1766         }
1767         list_add(&info->link, &domain->devices);
1768         list_add(&info->global, &device_domain_list);
1769         pdev->dev.archdata.iommu = info;
1770         spin_unlock_irqrestore(&device_domain_lock, flags);
1771         return domain;
1772 error:
1773         /* recheck here; someone else may have set it up meanwhile */
1774         return find_domain(pdev);
1775 }
1776
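     /*
      * Set up an identity (1:1) mapping of [start, end) in @pdev's domain:
      * reserve the corresponding IOVA range, clear any existing PTEs, map
      * the range read/write and install the context entry.  Used for RMRR
      * regions and for the graphics/ISA workarounds below.
      */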
1777 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1778                                       unsigned long long start,
1779                                       unsigned long long end)
1780 {
1781         struct dmar_domain *domain;
1782         unsigned long size;
1783         unsigned long long base;
1784         int ret;
1785
1786         printk(KERN_INFO
1787                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1788                 pci_name(pdev), start, end);
1789         /* page table init */
1790         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1791         if (!domain)
1792                 return -ENOMEM;
1793
1794         /* The address might not be aligned */
1795         base = start & PAGE_MASK;
1796         size = end - base;
1797         size = PAGE_ALIGN(size);
1798         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1799                         IOVA_PFN(base + size) - 1)) {
1800                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1801                 ret = -ENOMEM;
1802                 goto error;
1803         }
1804
1805         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1806                 size, base, pci_name(pdev));
1807         /*
1808          * RMRR range might have overlap with physical memory range,
1809          * clear it first
1810          */
1811         dma_pte_clear_range(domain, base, base + size);
1812
1813         ret = domain_page_mapping(domain, base, base, size,
1814                 DMA_PTE_READ|DMA_PTE_WRITE);
1815         if (ret)
1816                 goto error;
1817
1818         /* context entry init */
1819         ret = domain_context_mapping(domain, pdev);
1820         if (!ret)
1821                 return 0;
1822 error:
1823         domain_exit(domain);
1824         return ret;
1825
1826 }
1827
1828 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1829         struct pci_dev *pdev)
1830 {
1831         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1832                 return 0;
1833         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1834                 rmrr->end_address + 1);
1835 }
1836
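     /*
      * Graphics workaround: with CONFIG_DMAR_GFX_WA, every graphics device
      * that is not already ignored gets an identity mapping of all active
      * memory regions, so that devices doing DMA outside the DMA API (or to
      * ranges the BIOS did not describe via RMRRs) keep working.
      */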
1837 #ifdef CONFIG_DMAR_GFX_WA
1838 struct iommu_prepare_data {
1839         struct pci_dev *pdev;
1840         int ret;
1841 };
1842
1843 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1844                                          unsigned long end_pfn, void *datax)
1845 {
1846         struct iommu_prepare_data *data;
1847
1848         data = (struct iommu_prepare_data *)datax;
1849
1850         data->ret = iommu_prepare_identity_map(data->pdev,
1851                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1852         return data->ret;
1853
1854 }
1855
1856 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1857 {
1858         int nid;
1859         struct iommu_prepare_data data;
1860
1861         data.pdev = pdev;
1862         data.ret = 0;
1863
1864         for_each_online_node(nid) {
1865                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1866                 if (data.ret)
1867                         return data.ret;
1868         }
1869         return data.ret;
1870 }
1871
1872 static void __init iommu_prepare_gfx_mapping(void)
1873 {
1874         struct pci_dev *pdev = NULL;
1875         int ret;
1876
1877         for_each_pci_dev(pdev) {
1878                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1879                                 !IS_GFX_DEVICE(pdev))
1880                         continue;
1881                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1882                         pci_name(pdev));
1883                 ret = iommu_prepare_with_active_regions(pdev);
1884                 if (ret)
1885                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1886         }
1887 }
1888 #else /* !CONFIG_DMAR_GFX_WA */
1889 static inline void iommu_prepare_gfx_mapping(void)
1890 {
1891         return;
1892 }
1893 #endif
1894
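     /*
      * Floppy workaround: with CONFIG_DMAR_FLOPPY_WA, the first 16MB are
      * identity-mapped for the ISA (LPC) bridge, since legacy floppy DMA is
      * issued on behalf of that bridge and bypasses the DMA API.
      */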
1895 #ifdef CONFIG_DMAR_FLOPPY_WA
1896 static inline void iommu_prepare_isa(void)
1897 {
1898         struct pci_dev *pdev;
1899         int ret;
1900
1901         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1902         if (!pdev)
1903                 return;
1904
1905         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1906         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1907
1908         if (ret)
1909                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1910                         "floppy might not work\n");
1911
1912 }
1913 #else
1914 static inline void iommu_prepare_isa(void)
1915 {
1916         return;
1917 }
1918 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1919
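     /*
      * One-time DMA-remapping bring-up.  Roughly:
      *  - count the DRHD units and allocate the global iommu and
      *    deferred-flush arrays
      *  - per IOMMU: init domain ids, allocate the root entry table and
      *    choose queued vs. register-based invalidation
      *  - identity-map RMRR regions plus the gfx/ISA workarounds
      *  - per IOMMU: set up fault reporting, program the root entry, flush
      *    the caches and enable translation
      */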
1920 static int __init init_dmars(void)
1921 {
1922         struct dmar_drhd_unit *drhd;
1923         struct dmar_rmrr_unit *rmrr;
1924         struct pci_dev *pdev;
1925         struct intel_iommu *iommu;
1926         int i, ret, unit = 0;
1927
1928         /*
1929          * for each drhd
1930          *    allocate root
1931          *    initialize and program root entry to not present
1932          * endfor
1933          */
1934         for_each_drhd_unit(drhd) {
1935                 g_num_of_iommus++;
1936                 /*
1937                  * No lock needed: this is only incremented in the
1938                  * single-threaded kernel __init code path; all other
1939                  * accesses are read-only.
1940                  */
1941         }
1942
1943         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1944                         GFP_KERNEL);
1945         if (!g_iommus) {
1946                 printk(KERN_ERR "Allocating global iommu array failed\n");
1947                 ret = -ENOMEM;
1948                 goto error;
1949         }
1950
1951         deferred_flush = kzalloc(g_num_of_iommus *
1952                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1953         if (!deferred_flush) {
1954                 kfree(g_iommus);
1955                 ret = -ENOMEM;
1956                 goto error;
1957         }
1958
1959         for_each_drhd_unit(drhd) {
1960                 if (drhd->ignored)
1961                         continue;
1962
1963                 iommu = drhd->iommu;
1964                 g_iommus[iommu->seq_id] = iommu;
1965
1966                 ret = iommu_init_domains(iommu);
1967                 if (ret)
1968                         goto error;
1969
1970                 /*
1971                  * TBD:
1972                  * we could share the same root & context tables
1973                  * among all IOMMUs.  This needs to be split out later.
1974                  */
1975                 ret = iommu_alloc_root_entry(iommu);
1976                 if (ret) {
1977                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1978                         goto error;
1979                 }
1980         }
1981
1982         for_each_drhd_unit(drhd) {
1983                 if (drhd->ignored)
1984                         continue;
1985
1986                 iommu = drhd->iommu;
1987                 if (dmar_enable_qi(iommu)) {
1988                         /*
1989                          * Queued Invalidate not enabled, use Register Based
1990                          * Queued invalidation could not be enabled; fall
1991                          * back to register-based invalidation.
1992                         iommu->flush.flush_context = __iommu_flush_context;
1993                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1994                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1995                                "invalidation\n",
1996                                (unsigned long long)drhd->reg_base_addr);
1997                 } else {
1998                         iommu->flush.flush_context = qi_flush_context;
1999                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2000                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2001                                "invalidation\n",
2002                                (unsigned long long)drhd->reg_base_addr);
2003                 }
2004         }
2005
2006         /*
2007          * For each rmrr
2008          *   for each dev attached to rmrr
2009          *   do
2010          *     locate drhd for dev, alloc domain for dev
2011          *     allocate free domain
2012          *     allocate page table entries for rmrr
2013          *     if context not allocated for bus
2014          *           allocate and init context
2015          *           set present in root table for this bus
2016          *     init context with domain, translation etc
2017          *    endfor
2018          * endfor
2019          */
2020         for_each_rmrr_units(rmrr) {
2021                 for (i = 0; i < rmrr->devices_cnt; i++) {
2022                         pdev = rmrr->devices[i];
2023                         /* some BIOSes list non-existent devices in the DMAR table */
2024                         if (!pdev)
2025                                 continue;
2026                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2027                         if (ret)
2028                                 printk(KERN_ERR
2029                                  "IOMMU: mapping reserved region failed\n");
2030                 }
2031         }
2032
2033         iommu_prepare_gfx_mapping();
2034
2035         iommu_prepare_isa();
2036
2037         /*
2038          * for each drhd
2039          *   enable fault log
2040          *   global invalidate context cache
2041          *   global invalidate iotlb
2042          *   enable translation
2043          */
2044         for_each_drhd_unit(drhd) {
2045                 if (drhd->ignored)
2046                         continue;
2047                 iommu = drhd->iommu;
2048                 sprintf(iommu->name, "dmar%d", unit++);
2049
2050                 iommu_flush_write_buffer(iommu);
2051
2052                 ret = dmar_set_interrupt(iommu);
2053                 if (ret)
2054                         goto error;
2055
2056                 iommu_set_root_entry(iommu);
2057
2058                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2059                                            0);
2060                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2061                                          0);
2062                 iommu_disable_protect_mem_regions(iommu);
2063
2064                 ret = iommu_enable_translation(iommu);
2065                 if (ret)
2066                         goto error;
2067         }
2068
2069         return 0;
2070 error:
2071         for_each_drhd_unit(drhd) {
2072                 if (drhd->ignored)
2073                         continue;
2074                 iommu = drhd->iommu;
2075                 free_iommu(iommu);
2076         }
2077         kfree(g_iommus);
2078         return ret;
2079 }
2080
2081 static inline u64 aligned_size(u64 host_addr, size_t size)
2082 {
2083         u64 addr;
2084         addr = (host_addr & (~PAGE_MASK)) + size;
2085         return PAGE_ALIGN(addr);
2086 }
2087
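     /*
      * Allocate @size bytes of IOVA space from @domain below @end, clamped
      * to what the domain's guest address width allows.
      */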
2088 struct iova *
2089 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2090 {
2091         struct iova *piova;
2092
2093         /* Make sure it's in range */
2094         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2095         if (!size || (IOVA_START_ADDR + size > end))
2096                 return NULL;
2097
2098         piova = alloc_iova(&domain->iovad,
2099                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2100         return piova;
2101 }
2102
2103 static struct iova *
2104 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2105                    size_t size, u64 dma_mask)
2106 {
2107         struct pci_dev *pdev = to_pci_dev(dev);
2108         struct iova *iova = NULL;
2109
2110         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2111                 iova = iommu_alloc_iova(domain, size, dma_mask);
2112         else {
2113                 /*
2114                  * First try to allocate an I/O virtual address below
2115                  * DMA_32BIT_MASK; if that fails, try allocating from the
2116                  * higher range.
2117                  */
2118                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2119                 if (!iova)
2120                         iova = iommu_alloc_iova(domain, size, dma_mask);
2121         }
2122
2123         if (!iova) {
2124                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2125                 return NULL;
2126         }
2127
2128         return iova;
2129 }
2130
2131 static struct dmar_domain *
2132 get_valid_domain_for_dev(struct pci_dev *pdev)
2133 {
2134         struct dmar_domain *domain;
2135         int ret;
2136
2137         domain = get_domain_for_dev(pdev,
2138                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2139         if (!domain) {
2140                 printk(KERN_ERR
2141                         "Allocating domain for %s failed\n", pci_name(pdev));
2142                 return NULL;
2143         }
2144
2145         /* make sure context mapping is ok */
2146         if (unlikely(!domain_context_mapped(pdev))) {
2147                 ret = domain_context_mapping(domain, pdev);
2148                 if (ret) {
2149                         printk(KERN_ERR
2150                                 "Domain context map for %s failed\n",
2151                                 pci_name(pdev));
2152                         return NULL;
2153                 }
2154         }
2155
2156         return domain;
2157 }
2158
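     /*
      * Core of the map_single path: find (or create) the device's domain,
      * allocate an IOVA big enough for the request, build the page-table
      * entries with the protection implied by @dir, then flush the IOTLB
      * for the new non-present-to-present mapping.  Returns the bus address
      * on success and 0 on failure; pass-through ("dummy") devices simply
      * get the physical address back.
      */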
2159 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2160                                      size_t size, int dir, u64 dma_mask)
2161 {
2162         struct pci_dev *pdev = to_pci_dev(hwdev);
2163         struct dmar_domain *domain;
2164         phys_addr_t start_paddr;
2165         struct iova *iova;
2166         int prot = 0;
2167         int ret;
2168         struct intel_iommu *iommu;
2169
2170         BUG_ON(dir == DMA_NONE);
2171         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2172                 return paddr;
2173
2174         domain = get_valid_domain_for_dev(pdev);
2175         if (!domain)
2176                 return 0;
2177
2178         iommu = domain_get_iommu(domain);
2179         size = aligned_size((u64)paddr, size);
2180
2181         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2182         if (!iova)
2183                 goto error;
2184
2185         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2186
2187         /*
2188          * Check if DMAR supports zero-length reads on write only
2189          * mappings.
2190          */
2191         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2192                         !cap_zlr(iommu->cap))
2193                 prot |= DMA_PTE_READ;
2194         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2195                 prot |= DMA_PTE_WRITE;
2196         /*
2197          * paddr - (paddr + size) might span a partial page, so map the
2198          * whole page.  Note: if two parts of one page are mapped
2199          * separately, two guest addresses may map to the same host paddr,
2200          * but this is not a big problem.
2201          */
2202         ret = domain_page_mapping(domain, start_paddr,
2203                 ((u64)paddr) & PAGE_MASK, size, prot);
2204         if (ret)
2205                 goto error;
2206
2207         /* it's a non-present to present mapping */
2208         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2209                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2210         if (ret)
2211                 iommu_flush_write_buffer(iommu);
2212
2213         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2214
2215 error:
2216         if (iova)
2217                 __free_iova(&domain->iovad, iova);
2218         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2219                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2220         return 0;
2221 }
2222
2223 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2224                             size_t size, int dir)
2225 {
2226         return __intel_map_single(hwdev, paddr, size, dir,
2227                                   to_pci_dev(hwdev)->dma_mask);
2228 }
2229
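     /*
      * Deferred-unmap handling: freed IOVAs are batched per IOMMU in
      * deferred_flush[] and released here after one global IOTLB flush per
      * IOMMU, instead of flushing on every unmap.  Called with
      * async_umap_flush_lock held, either from the unmap timer or when a
      * batch reaches HIGH_WATER_MARK.
      */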
2230 static void flush_unmaps(void)
2231 {
2232         int i, j;
2233
2234         timer_on = 0;
2235
2236         /* just flush them all */
2237         for (i = 0; i < g_num_of_iommus; i++) {
2238                 struct intel_iommu *iommu = g_iommus[i];
2239                 if (!iommu)
2240                         continue;
2241
2242                 if (deferred_flush[i].next) {
2243                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2244                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2245                         for (j = 0; j < deferred_flush[i].next; j++) {
2246                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2247                                                 deferred_flush[i].iova[j]);
2248                         }
2249                         deferred_flush[i].next = 0;
2250                 }
2251         }
2252
2253         list_size = 0;
2254 }
2255
2256 static void flush_unmaps_timeout(unsigned long data)
2257 {
2258         unsigned long flags;
2259
2260         spin_lock_irqsave(&async_umap_flush_lock, flags);
2261         flush_unmaps();
2262         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2263 }
2264
2265 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2266 {
2267         unsigned long flags;
2268         int next, iommu_id;
2269         struct intel_iommu *iommu;
2270
2271         spin_lock_irqsave(&async_umap_flush_lock, flags);
2272         if (list_size == HIGH_WATER_MARK)
2273                 flush_unmaps();
2274
2275         iommu = domain_get_iommu(dom);
2276         iommu_id = iommu->seq_id;
2277
2278         next = deferred_flush[iommu_id].next;
2279         deferred_flush[iommu_id].domain[next] = dom;
2280         deferred_flush[iommu_id].iova[next] = iova;
2281         deferred_flush[iommu_id].next++;
2282
2283         if (!timer_on) {
2284                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2285                 timer_on = 1;
2286         }
2287         list_size++;
2288         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2289 }
2290
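     /*
      * Unmap a single mapping: look up the IOVA covering @dev_addr, clear
      * and free the page-table entries behind it, then either flush the
      * IOTLB and free the IOVA immediately (intel_iommu_strict) or queue
      * the IOVA for the deferred flush.
      */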
2291 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2292                         int dir)
2293 {
2294         struct pci_dev *pdev = to_pci_dev(dev);
2295         struct dmar_domain *domain;
2296         unsigned long start_addr;
2297         struct iova *iova;
2298         struct intel_iommu *iommu;
2299
2300         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2301                 return;
2302         domain = find_domain(pdev);
2303         BUG_ON(!domain);
2304
2305         iommu = domain_get_iommu(domain);
2306
2307         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2308         if (!iova)
2309                 return;
2310
2311         start_addr = iova->pfn_lo << PAGE_SHIFT;
2312         size = aligned_size((u64)dev_addr, size);
2313
2314         pr_debug("Device %s unmapping: %lx@%llx\n",
2315                 pci_name(pdev), size, (unsigned long long)start_addr);
2316
2317         /*  clear the whole page */
2318         dma_pte_clear_range(domain, start_addr, start_addr + size);
2319         /* free page tables */
2320         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2321         if (intel_iommu_strict) {
2322                 if (iommu_flush_iotlb_psi(iommu,
2323                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2324                         iommu_flush_write_buffer(iommu);
2325                 /* free iova */
2326                 __free_iova(&domain->iovad, iova);
2327         } else {
2328                 add_unmap(domain, iova);
2329                 /*
2330                  * Queue up the release of the unmapped range; this saves
2331                  * the roughly 1/6th of CPU time eaten by the IOTLB flush.
2332                  */
2333         }
2334 }
2335
2336 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2337                            dma_addr_t *dma_handle, gfp_t flags)
2338 {
2339         void *vaddr;
2340         int order;
2341
2342         size = PAGE_ALIGN(size);
2343         order = get_order(size);
2344         flags &= ~(GFP_DMA | GFP_DMA32);
2345
2346         vaddr = (void *)__get_free_pages(flags, order);
2347         if (!vaddr)
2348                 return NULL;
2349         memset(vaddr, 0, size);
2350
2351         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2352                                          DMA_BIDIRECTIONAL,
2353                                          hwdev->coherent_dma_mask);
2354         if (*dma_handle)
2355                 return vaddr;
2356         free_pages((unsigned long)vaddr, order);
2357         return NULL;
2358 }
2359
2360 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2361                          dma_addr_t dma_handle)
2362 {
2363         int order;
2364
2365         size = PAGE_ALIGN(size);
2366         order = get_order(size);
2367
2368         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2369         free_pages((unsigned long)vaddr, order);
2370 }
2371
2372 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2373
2374 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2375                     int nelems, int dir)
2376 {
2377         int i;
2378         struct pci_dev *pdev = to_pci_dev(hwdev);
2379         struct dmar_domain *domain;
2380         unsigned long start_addr;
2381         struct iova *iova;
2382         size_t size = 0;
2383         void *addr;
2384         struct scatterlist *sg;
2385         struct intel_iommu *iommu;
2386
2387         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2388                 return;
2389
2390         domain = find_domain(pdev);
2391         BUG_ON(!domain);
2392
2393         iommu = domain_get_iommu(domain);
2394
2395         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2396         if (!iova)
2397                 return;
2398         for_each_sg(sglist, sg, nelems, i) {
2399                 addr = SG_ENT_VIRT_ADDRESS(sg);
2400                 size += aligned_size((u64)addr, sg->length);
2401         }
2402
2403         start_addr = iova->pfn_lo << PAGE_SHIFT;
2404
2405         /*  clear the whole page */
2406         dma_pte_clear_range(domain, start_addr, start_addr + size);
2407         /* free page tables */
2408         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2409
2410         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2411                         size >> VTD_PAGE_SHIFT, 0))
2412                 iommu_flush_write_buffer(iommu);
2413
2414         /* free iova */
2415         __free_iova(&domain->iovad, iova);
2416 }
2417
2418 static int intel_nontranslate_map_sg(struct device *hwdev,
2419         struct scatterlist *sglist, int nelems, int dir)
2420 {
2421         int i;
2422         struct scatterlist *sg;
2423
2424         for_each_sg(sglist, sg, nelems, i) {
2425                 BUG_ON(!sg_page(sg));
2426                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2427                 sg->dma_length = sg->length;
2428         }
2429         return nelems;
2430 }
2431
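     /*
      * Map a scatterlist: add up the (page-aligned) lengths, allocate one
      * contiguous IOVA range for the whole list, then map each element at
      * its running offset within that range.  On any failure everything
      * mapped so far is torn down and 0 is returned.
      */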
2432 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2433                  int dir)
2434 {
2435         void *addr;
2436         int i;
2437         struct pci_dev *pdev = to_pci_dev(hwdev);
2438         struct dmar_domain *domain;
2439         size_t size = 0;
2440         int prot = 0;
2441         size_t offset = 0;
2442         struct iova *iova = NULL;
2443         int ret;
2444         struct scatterlist *sg;
2445         unsigned long start_addr;
2446         struct intel_iommu *iommu;
2447
2448         BUG_ON(dir == DMA_NONE);
2449         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2450                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2451
2452         domain = get_valid_domain_for_dev(pdev);
2453         if (!domain)
2454                 return 0;
2455
2456         iommu = domain_get_iommu(domain);
2457
2458         for_each_sg(sglist, sg, nelems, i) {
2459                 addr = SG_ENT_VIRT_ADDRESS(sg);
2460                 addr = (void *)virt_to_phys(addr);
2461                 size += aligned_size((u64)addr, sg->length);
2462         }
2463
2464         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2465         if (!iova) {
2466                 sglist->dma_length = 0;
2467                 return 0;
2468         }
2469
2470         /*
2471          * Check if DMAR supports zero-length reads on write only
2472          * mappings.
2473          */
2474         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2475                         !cap_zlr(iommu->cap))
2476                 prot |= DMA_PTE_READ;
2477         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2478                 prot |= DMA_PTE_WRITE;
2479
2480         start_addr = iova->pfn_lo << PAGE_SHIFT;
2481         offset = 0;
2482         for_each_sg(sglist, sg, nelems, i) {
2483                 addr = SG_ENT_VIRT_ADDRESS(sg);
2484                 addr = (void *)virt_to_phys(addr);
2485                 size = aligned_size((u64)addr, sg->length);
2486                 ret = domain_page_mapping(domain, start_addr + offset,
2487                         ((u64)addr) & PAGE_MASK,
2488                         size, prot);
2489                 if (ret) {
2490                         /*  clear the page */
2491                         dma_pte_clear_range(domain, start_addr,
2492                                   start_addr + offset);
2493                         /* free page tables */
2494                         dma_pte_free_pagetable(domain, start_addr,
2495                                   start_addr + offset);
2496                         /* free iova */
2497                         __free_iova(&domain->iovad, iova);
2498                         return 0;
2499                 }
2500                 sg->dma_address = start_addr + offset +
2501                                 ((u64)addr & (~PAGE_MASK));
2502                 sg->dma_length = sg->length;
2503                 offset += size;
2504         }
2505
2506         /* it's a non-present to present mapping */
2507         if (iommu_flush_iotlb_psi(iommu, domain->id,
2508                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2509                 iommu_flush_write_buffer(iommu);
2510         return nelems;
2511 }
2512
2513 static struct dma_mapping_ops intel_dma_ops = {
2514         .alloc_coherent = intel_alloc_coherent,
2515         .free_coherent = intel_free_coherent,
2516         .map_single = intel_map_single,
2517         .unmap_single = intel_unmap_single,
2518         .map_sg = intel_map_sg,
2519         .unmap_sg = intel_unmap_sg,
2520 };
2521
2522 static inline int iommu_domain_cache_init(void)
2523 {
2524         int ret = 0;
2525
2526         iommu_domain_cache = kmem_cache_create("iommu_domain",
2527                                          sizeof(struct dmar_domain),
2528                                          0,
2529                                          SLAB_HWCACHE_ALIGN,
2531                                          NULL);
2532         if (!iommu_domain_cache) {
2533                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2534                 ret = -ENOMEM;
2535         }
2536
2537         return ret;
2538 }
2539
2540 static inline int iommu_devinfo_cache_init(void)
2541 {
2542         int ret = 0;
2543
2544         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2545                                          sizeof(struct device_domain_info),
2546                                          0,
2547                                          SLAB_HWCACHE_ALIGN,
2548                                          NULL);
2549         if (!iommu_devinfo_cache) {
2550                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2551                 ret = -ENOMEM;
2552         }
2553
2554         return ret;
2555 }
2556
2557 static inline int iommu_iova_cache_init(void)
2558 {
2559         int ret = 0;
2560
2561         iommu_iova_cache = kmem_cache_create("iommu_iova",
2562                                          sizeof(struct iova),
2563                                          0,
2564                                          SLAB_HWCACHE_ALIGN,
2565                                          NULL);
2566         if (!iommu_iova_cache) {
2567                 printk(KERN_ERR "Couldn't create iova cache\n");
2568                 ret = -ENOMEM;
2569         }
2570
2571         return ret;
2572 }
2573
2574 static int __init iommu_init_mempool(void)
2575 {
2576         int ret;
2577         ret = iommu_iova_cache_init();
2578         if (ret)
2579                 return ret;
2580
2581         ret = iommu_domain_cache_init();
2582         if (ret)
2583                 goto domain_error;
2584
2585         ret = iommu_devinfo_cache_init();
2586         if (!ret)
2587                 return ret;
2588
2589         kmem_cache_destroy(iommu_domain_cache);
2590 domain_error:
2591         kmem_cache_destroy(iommu_iova_cache);
2592
2593         return -ENOMEM;
2594 }
2595
2596 static void __init iommu_exit_mempool(void)
2597 {
2598         kmem_cache_destroy(iommu_devinfo_cache);
2599         kmem_cache_destroy(iommu_domain_cache);
2600         kmem_cache_destroy(iommu_iova_cache);
2601
2602 }
2603
2604 static void __init init_no_remapping_devices(void)
2605 {
2606         struct dmar_drhd_unit *drhd;
2607
2608         for_each_drhd_unit(drhd) {
2609                 if (!drhd->include_all) {
2610                         int i;
2611                         for (i = 0; i < drhd->devices_cnt; i++)
2612                                 if (drhd->devices[i] != NULL)
2613                                         break;
2614                         /* ignore DMAR unit if no pci devices exist */
2615                         if (i == drhd->devices_cnt)
2616                                 drhd->ignored = 1;
2617                 }
2618         }
2619
2620         if (dmar_map_gfx)
2621                 return;
2622
2623         for_each_drhd_unit(drhd) {
2624                 int i;
2625                 if (drhd->ignored || drhd->include_all)
2626                         continue;
2627
2628                 for (i = 0; i < drhd->devices_cnt; i++)
2629                         if (drhd->devices[i] &&
2630                                 !IS_GFX_DEVICE(drhd->devices[i]))
2631                                 break;
2632
2633                 if (i < drhd->devices_cnt)
2634                         continue;
2635
2636                 /* bypass IOMMU if it is just for gfx devices */
2637                 drhd->ignored = 1;
2638                 for (i = 0; i < drhd->devices_cnt; i++) {
2639                         if (!drhd->devices[i])
2640                                 continue;
2641                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2642                 }
2643         }
2644 }
2645
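     /*
      * Main entry point for DMA remapping: parse the DMAR table, bail out
      * if remapping is not wanted (no_iommu, swiotlb or dmar_disabled),
      * run init_dmars() and, on success, install intel_dma_ops as the
      * system dma_ops.
      */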
2646 int __init intel_iommu_init(void)
2647 {
2648         int ret = 0;
2649
2650         if (dmar_table_init())
2651                 return  -ENODEV;
2652
2653         if (dmar_dev_scope_init())
2654                 return  -ENODEV;
2655
2656         /*
2657          * Check the need for DMA-remapping initialization now.
2658          * Above initialization will also be used by Interrupt-remapping.
2659          */
2660         if (no_iommu || swiotlb || dmar_disabled)
2661                 return -ENODEV;
2662
2663         iommu_init_mempool();
2664         dmar_init_reserved_ranges();
2665
2666         init_no_remapping_devices();
2667
2668         ret = init_dmars();
2669         if (ret) {
2670                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2671                 put_iova_domain(&reserved_iova_list);
2672                 iommu_exit_mempool();
2673                 return ret;
2674         }
2675         printk(KERN_INFO
2676         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2677
2678         init_timer(&unmap_timer);
2679         force_iommu = 1;
2680         dma_ops = &intel_dma_ops;
2681         return 0;
2682 }
2683
2684 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2685                                   struct pci_dev *pdev)
2686 {
2687         struct device_domain_info *info;
2688         unsigned long flags;
2689
2690         info = alloc_devinfo_mem();
2691         if (!info)
2692                 return -ENOMEM;
2693
2694         info->bus = pdev->bus->number;
2695         info->devfn = pdev->devfn;
2696         info->dev = pdev;
2697         info->domain = domain;
2698
2699         spin_lock_irqsave(&device_domain_lock, flags);
2700         list_add(&info->link, &domain->devices);
2701         list_add(&info->global, &device_domain_list);
2702         pdev->dev.archdata.iommu = info;
2703         spin_unlock_irqrestore(&device_domain_lock, flags);
2704
2705         return 0;
2706 }
2707
2708 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2709                                           struct pci_dev *pdev)
2710 {
2711         struct device_domain_info *info;
2712         struct intel_iommu *iommu;
2713         unsigned long flags;
2714         int found = 0;
2715         struct list_head *entry, *tmp;
2716
2717         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2718         if (!iommu)
2719                 return;
2720
2721         spin_lock_irqsave(&device_domain_lock, flags);
2722         list_for_each_safe(entry, tmp, &domain->devices) {
2723                 info = list_entry(entry, struct device_domain_info, link);
2724                 if (info->bus == pdev->bus->number &&
2725                     info->devfn == pdev->devfn) {
2726                         list_del(&info->link);
2727                         list_del(&info->global);
2728                         if (info->dev)
2729                                 info->dev->dev.archdata.iommu = NULL;
2730                         spin_unlock_irqrestore(&device_domain_lock, flags);
2731
2732                         iommu_detach_dev(iommu, info->bus, info->devfn);
2733                         free_devinfo_mem(info);
2734
2735                         spin_lock_irqsave(&device_domain_lock, flags);
2736
2737                         if (found)
2738                                 break;
2739                         else
2740                                 continue;
2741                 }
2742
2743                 /* if there are no other devices under the same iommu
2744                  * owned by this domain, clear this iommu from iommu_bmp
2745                  * and update the iommu count and coherency
2746                  */
2747                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2748                         found = 1;
2749         }
2750
2751         if (found == 0) {
2752                 unsigned long tmp_flags;
2753                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2754                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2755                 domain->iommu_count--;
2756                 domain_update_iommu_coherency(domain);
2757                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2758         }
2759
2760         spin_unlock_irqrestore(&device_domain_lock, flags);
2761 }
2762
2763 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2764 {
2765         struct device_domain_info *info;
2766         struct intel_iommu *iommu;
2767         unsigned long flags1, flags2;
2768
2769         spin_lock_irqsave(&device_domain_lock, flags1);
2770         while (!list_empty(&domain->devices)) {
2771                 info = list_entry(domain->devices.next,
2772                         struct device_domain_info, link);
2773                 list_del(&info->link);
2774                 list_del(&info->global);
2775                 if (info->dev)
2776                         info->dev->dev.archdata.iommu = NULL;
2777
2778                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2779
2780                 iommu = device_to_iommu(info->bus, info->devfn);
2781                 iommu_detach_dev(iommu, info->bus, info->devfn);
2782
2783                 /* clear this iommu in iommu_bmp, update iommu count
2784                  * and coherency
2785                  */
2786                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2787                 if (test_and_clear_bit(iommu->seq_id,
2788                                        &domain->iommu_bmp)) {
2789                         domain->iommu_count--;
2790                         domain_update_iommu_coherency(domain);
2791                 }
2792                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2793
2794                 free_devinfo_mem(info);
2795                 spin_lock_irqsave(&device_domain_lock, flags1);
2796         }
2797         spin_unlock_irqrestore(&device_domain_lock, flags1);
2798 }
2799
2800 /* domain id for virtual machine, it won't be set in context */
2801 static unsigned long vm_domid;
2802
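     /*
      * "Virtual machine" domains are created for an external user (e.g. a
      * virtualization layer) rather than derived from a specific PCI
      * device.  They get ids from the separate vm_domid space, are marked
      * DOMAIN_FLAG_VIRTUAL_MACHINE and may span multiple IOMMUs, tracked
      * through the domain's iommu_bmp and iommu_count.
      */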
2803 static struct dmar_domain *iommu_alloc_vm_domain(void)
2804 {
2805         struct dmar_domain *domain;
2806
2807         domain = alloc_domain_mem();
2808         if (!domain)
2809                 return NULL;
2810
2811         domain->id = vm_domid++;
2812         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2813         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2814
2815         return domain;
2816 }
2817
2818 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2819 {
2820         int adjust_width;
2821
2822         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2823         spin_lock_init(&domain->mapping_lock);
2824         spin_lock_init(&domain->iommu_lock);
2825
2826         domain_reserve_special_ranges(domain);
2827
2828         /* calculate AGAW */
2829         domain->gaw = guest_width;
2830         adjust_width = guestwidth_to_adjustwidth(guest_width);
2831         domain->agaw = width_to_agaw(adjust_width);
2832
2833         INIT_LIST_HEAD(&domain->devices);
2834
2835         domain->iommu_count = 0;
2836         domain->iommu_coherency = 0;
2837
2838         /* always allocate the top pgd */
2839         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2840         if (!domain->pgd)
2841                 return -ENOMEM;
2842         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2843         return 0;
2844 }
2845
2846 static void iommu_free_vm_domain(struct dmar_domain *domain)
2847 {
2848         unsigned long flags;
2849         struct dmar_drhd_unit *drhd;
2850         struct intel_iommu *iommu;
2851         unsigned long i;
2852         unsigned long ndomains;
2853
2854         for_each_drhd_unit(drhd) {
2855                 if (drhd->ignored)
2856                         continue;
2857                 iommu = drhd->iommu;
2858
2859                 ndomains = cap_ndoms(iommu->cap);
2860                 i = find_first_bit(iommu->domain_ids, ndomains);
2861                 for (; i < ndomains; ) {
2862                         if (iommu->domains[i] == domain) {
2863                                 spin_lock_irqsave(&iommu->lock, flags);
2864                                 clear_bit(i, iommu->domain_ids);
2865                                 iommu->domains[i] = NULL;
2866                                 spin_unlock_irqrestore(&iommu->lock, flags);
2867                                 break;
2868                         }
2869                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2870                 }
2871         }
2872 }
2873
2874 static void vm_domain_exit(struct dmar_domain *domain)
2875 {
2876         u64 end;
2877
2878         /* Domain 0 is reserved, so don't process it */
2879         if (!domain)
2880                 return;
2881
2882         vm_domain_remove_all_dev_info(domain);
2883         /* destroy iovas */
2884         put_iova_domain(&domain->iovad);
2885         end = DOMAIN_MAX_ADDR(domain->gaw);
2886         end = end & (~VTD_PAGE_MASK);
2887
2888         /* clear ptes */
2889         dma_pte_clear_range(domain, 0, end);
2890
2891         /* free page tables */
2892         dma_pte_free_pagetable(domain, 0, end);
2893
2894         iommu_free_vm_domain(domain);
2895         free_domain_mem(domain);
2896 }
2897
2898 void intel_iommu_domain_exit(struct dmar_domain *domain)
2899 {
2900         u64 end;
2901
2902         /* Domain 0 is reserved, so don't process it */
2903         if (!domain)
2904                 return;
2905
2906         end = DOMAIN_MAX_ADDR(domain->gaw);
2907         end = end & (~VTD_PAGE_MASK);
2908
2909         /* clear ptes */
2910         dma_pte_clear_range(domain, 0, end);
2911
2912         /* free page tables */
2913         dma_pte_free_pagetable(domain, 0, end);
2914
2915         iommu_free_domain(domain);
2916         free_domain_mem(domain);
2917 }
2918 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2919
2920 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2921 {
2922         struct dmar_drhd_unit *drhd;
2923         struct dmar_domain *domain;
2924         struct intel_iommu *iommu;
2925
2926         drhd = dmar_find_matched_drhd_unit(pdev);
2927         if (!drhd) {
2928                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2929                 return NULL;
2930         }
2931
2932         iommu = drhd->iommu;
2933         if (!iommu) {
2934                 printk(KERN_ERR
2935                         "intel_iommu_domain_alloc: iommu == NULL\n");
2936                 return NULL;
2937         }
2938         domain = iommu_alloc_domain(iommu);
2939         if (!domain) {
2940                 printk(KERN_ERR
2941                         "intel_iommu_domain_alloc: domain == NULL\n");
2942                 return NULL;
2943         }
2944         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2945                 printk(KERN_ERR
2946                         "intel_iommu_domain_alloc: domain_init() failed\n");
2947                 intel_iommu_domain_exit(domain);
2948                 return NULL;
2949         }
2950         return domain;
2951 }
2952 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
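     /*
      * A minimal usage sketch for these exported hooks, assuming a
      * hypothetical external consumer (such as a device-assignment layer)
      * with its own pdev, gpa, hpa and size values:
      *
      *     struct dmar_domain *dom = intel_iommu_domain_alloc(pdev);
      *
      *     if (dom && !intel_iommu_context_mapping(dom, pdev))
      *             intel_iommu_page_mapping(dom, gpa, hpa, size,
      *                                      DMA_PTE_READ | DMA_PTE_WRITE);
      *     ...
      *     intel_iommu_detach_dev(dom, pdev->bus->number, pdev->devfn);
      *     intel_iommu_domain_exit(dom);
      */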
2953
2954 int intel_iommu_context_mapping(
2955         struct dmar_domain *domain, struct pci_dev *pdev)
2956 {
2957         int rc;
2958         rc = domain_context_mapping(domain, pdev);
2959         return rc;
2960 }
2961 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2962
2963 int intel_iommu_page_mapping(
2964         struct dmar_domain *domain, dma_addr_t iova,
2965         u64 hpa, size_t size, int prot)
2966 {
2967         int rc;
2968         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2969         return rc;
2970 }
2971 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2972
2973 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2974 {
2975         struct intel_iommu *iommu;
2976
2977         iommu = device_to_iommu(bus, devfn);
2978         iommu_detach_dev(iommu, bus, devfn);
2979 }
2980 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2981
2982 struct dmar_domain *
2983 intel_iommu_find_domain(struct pci_dev *pdev)
2984 {
2985         return find_domain(pdev);
2986 }
2987 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2988
2989 int intel_iommu_found(void)
2990 {
2991         return g_num_of_iommus;
2992 }
2993 EXPORT_SYMBOL_GPL(intel_iommu_found);
2994
2995 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2996 {
2997         struct dma_pte *pte;
2998         u64 pfn;
2999
3000         pfn = 0;
3001         pte = addr_to_dma_pte(domain, iova);
3002
3003         if (pte)
3004                 pfn = dma_pte_addr(pte);
3005
3006         return pfn >> VTD_PAGE_SHIFT;
3007 }
3008 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);