1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 static int rwbf_quirk;
65
66 /*
67  * 0: Present
68  * 1-11: Reserved
69  * 12-63: Context Ptr (12 - (haw-1))
70  * 64-127: Reserved
71  */
72 struct root_entry {
73         u64     val;
74         u64     rsvd1;
75 };
76 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
77 static inline bool root_present(struct root_entry *root)
78 {
79         return (root->val & 1);
80 }
81 static inline void set_root_present(struct root_entry *root)
82 {
83         root->val |= 1;
84 }
85 static inline void set_root_value(struct root_entry *root, unsigned long value)
86 {
87         root->val |= value & VTD_PAGE_MASK;
88 }
89
90 static inline struct context_entry *
91 get_context_addr_from_root(struct root_entry *root)
92 {
93         return (struct context_entry *)
94                 (root_present(root)?phys_to_virt(
95                 root->val & VTD_PAGE_MASK) :
96                 NULL);
97 }
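/*
 * Root and context entries implement the VT-d two-level device lookup:
 * root_entry[bus] (ROOT_ENTRY_NR == 256 slots, one per PCI bus) points
 * to a 4KiB context table, and context_table[devfn] (256 slots) holds
 * the domain id, address width and page-table root used to translate
 * that device's DMA.  device_to_context_entry() below walks exactly
 * this path, allocating the context table on first use.
 */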
98
99 /*
100  * low 64 bits:
101  * 0: present
102  * 1: fault processing disable
103  * 2-3: translation type
104  * 12-63: address space root
105  * high 64 bits:
106  * 0-2: address width
107  * 3-6: avail
108  * 8-23: domain id
109  */
110 struct context_entry {
111         u64 lo;
112         u64 hi;
113 };
114
115 static inline bool context_present(struct context_entry *context)
116 {
117         return (context->lo & 1);
118 }
119 static inline void context_set_present(struct context_entry *context)
120 {
121         context->lo |= 1;
122 }
123
124 static inline void context_set_fault_enable(struct context_entry *context)
125 {
126         context->lo &= (((u64)-1) << 2) | 1;
127 }
128
129 #define CONTEXT_TT_MULTI_LEVEL 0
130
131 static inline void context_set_translation_type(struct context_entry *context,
132                                                 unsigned long value)
133 {
134         context->lo &= (((u64)-1) << 4) | 3;
135         context->lo |= (value & 3) << 2;
136 }
137
138 static inline void context_set_address_root(struct context_entry *context,
139                                             unsigned long value)
140 {
141         context->lo |= value & VTD_PAGE_MASK;
142 }
143
144 static inline void context_set_address_width(struct context_entry *context,
145                                              unsigned long value)
146 {
147         context->hi |= value & 7;
148 }
149
150 static inline void context_set_domain_id(struct context_entry *context,
151                                          unsigned long value)
152 {
153         context->hi |= (value & ((1 << 16) - 1)) << 8;
154 }
155
156 static inline void context_clear_entry(struct context_entry *context)
157 {
158         context->lo = 0;
159         context->hi = 0;
160 }
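/*
 * A context entry is filled in by the setters above; for example,
 * domain_context_mapping_one() below calls, in order:
 *
 *   context_set_domain_id(context, id);
 *   context_set_address_width(context, iommu->agaw);
 *   context_set_address_root(context, virt_to_phys(pgd));
 *   context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *   context_set_fault_enable(context);
 *   context_set_present(context);
 *
 * With id 5 and agaw 2 (illustrative values) this leaves
 * hi = (5 << 8) | 2 = 0x502 and lo = page-table root | present bit.
 */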
161
162 /*
163  * 0: readable
164  * 1: writable
165  * 2-6: reserved
166  * 7: super page
167  * 8-11: available
168  * 12-63: Host physical address
169  */
170 struct dma_pte {
171         u64 val;
172 };
173
174 static inline void dma_clear_pte(struct dma_pte *pte)
175 {
176         pte->val = 0;
177 }
178
179 static inline void dma_set_pte_readable(struct dma_pte *pte)
180 {
181         pte->val |= DMA_PTE_READ;
182 }
183
184 static inline void dma_set_pte_writable(struct dma_pte *pte)
185 {
186         pte->val |= DMA_PTE_WRITE;
187 }
188
189 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
190 {
191         pte->val = (pte->val & ~3) | (prot & 3);
192 }
193
194 static inline u64 dma_pte_addr(struct dma_pte *pte)
195 {
196         return (pte->val & VTD_PAGE_MASK);
197 }
198
199 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
200 {
201         pte->val |= (addr & VTD_PAGE_MASK);
202 }
203
204 static inline bool dma_pte_present(struct dma_pte *pte)
205 {
206         return (pte->val & 3) != 0;
207 }
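/*
 * Note that there is no separate "present" bit in a dma_pte:
 * dma_pte_present() treats an entry as present when either the read or
 * the write permission bit (bits 0-1) is set, and dma_clear_pte()
 * simply zeroes the whole entry.
 */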
208
209 /* devices under the same p2p bridge are owned in one domain */
210 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
211
212 /* domain represents a virtual machine; more than one device
213  * across iommus may be owned by one domain, e.g. a kvm guest.
214  */
215 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
216
217 struct dmar_domain {
218         int     id;                     /* domain id */
219         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses */
220
221         struct list_head devices;       /* all devices' list */
222         struct iova_domain iovad;       /* iova's that belong to this domain */
223
224         struct dma_pte  *pgd;           /* virtual address */
225         spinlock_t      mapping_lock;   /* page table lock */
226         int             gaw;            /* max guest address width */
227
228         /* adjusted guest address width, 0 is level 2 30-bit */
229         int             agaw;
230
231         int             flags;          /* flags to find out type of domain */
232
233         int             iommu_coherency;/* indicate coherency of iommu access */
234         int             iommu_count;    /* reference count of iommu */
235         spinlock_t      iommu_lock;     /* protect iommu set in domain */
236         u64             max_addr;       /* maximum mapped address */
237 };
238
239 /* PCI domain-device relationship */
240 struct device_domain_info {
241         struct list_head link;  /* link to domain siblings */
242         struct list_head global; /* link to global list */
243         u8 bus;                 /* PCI bus number */
244         u8 devfn;               /* PCI devfn number */
245         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
246         struct dmar_domain *domain; /* pointer to domain */
247 };
248
249 static void flush_unmaps_timeout(unsigned long data);
250
251 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
252
253 #define HIGH_WATER_MARK 250
254 struct deferred_flush_tables {
255         int next;
256         struct iova *iova[HIGH_WATER_MARK];
257         struct dmar_domain *domain[HIGH_WATER_MARK];
258 };
259
260 static struct deferred_flush_tables *deferred_flush;
261
262 /* number of iommus; bounds g_iommus[] and the per-domain iommu bitmaps */
263 static int g_num_of_iommus;
264
265 static DEFINE_SPINLOCK(async_umap_flush_lock);
266 static LIST_HEAD(unmaps_to_do);
267
268 static int timer_on;
269 static long list_size;
270
271 static void domain_remove_dev_info(struct dmar_domain *domain);
272
273 #ifdef CONFIG_DMAR_DEFAULT_ON
274 int dmar_disabled = 0;
275 #else
276 int dmar_disabled = 1;
277 #endif /*CONFIG_DMAR_DEFAULT_ON*/
278
279 static int __initdata dmar_map_gfx = 1;
280 static int dmar_forcedac;
281 static int intel_iommu_strict;
282
283 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
284 static DEFINE_SPINLOCK(device_domain_lock);
285 static LIST_HEAD(device_domain_list);
286
287 static struct iommu_ops intel_iommu_ops;
288
289 static int __init intel_iommu_setup(char *str)
290 {
291         if (!str)
292                 return -EINVAL;
293         while (*str) {
294                 if (!strncmp(str, "on", 2)) {
295                         dmar_disabled = 0;
296                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
297                 } else if (!strncmp(str, "off", 3)) {
298                         dmar_disabled = 1;
299                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
300                 } else if (!strncmp(str, "igfx_off", 8)) {
301                         dmar_map_gfx = 0;
302                         printk(KERN_INFO
303                                 "Intel-IOMMU: disable GFX device mapping\n");
304                 } else if (!strncmp(str, "forcedac", 8)) {
305                         printk(KERN_INFO
306                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
307                         dmar_forcedac = 1;
308                 } else if (!strncmp(str, "strict", 6)) {
309                         printk(KERN_INFO
310                                 "Intel-IOMMU: disable batched IOTLB flush\n");
311                         intel_iommu_strict = 1;
312                 }
313
314                 str += strcspn(str, ",");
315                 while (*str == ',')
316                         str++;
317         }
318         return 0;
319 }
320 __setup("intel_iommu=", intel_iommu_setup);
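/*
 * The parser above takes a comma-separated option list on the kernel
 * command line, e.g. (illustrative):
 *
 *   intel_iommu=on
 *   intel_iommu=on,igfx_off,strict
 *   intel_iommu=off
 *
 * Unknown tokens are silently skipped.
 */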
321
322 static struct kmem_cache *iommu_domain_cache;
323 static struct kmem_cache *iommu_devinfo_cache;
324 static struct kmem_cache *iommu_iova_cache;
325
326 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
327 {
328         unsigned int flags;
329         void *vaddr;
330
331         /* trying to avoid low memory issues */
332         flags = current->flags & PF_MEMALLOC;
333         current->flags |= PF_MEMALLOC;
334         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
335         current->flags &= (~PF_MEMALLOC | flags);
336         return vaddr;
337 }
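/*
 * The PF_MEMALLOC save/restore above is deliberately terse: "flags"
 * holds only the caller's original PF_MEMALLOC bit, so the final
 * "current->flags &= (~PF_MEMALLOC | flags)" clears PF_MEMALLOC only
 * when it was not already set on entry and leaves all other task flags
 * untouched.  alloc_pgtable_page() below uses the same trick.
 */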
338
339
340 static inline void *alloc_pgtable_page(void)
341 {
342         unsigned int flags;
343         void *vaddr;
344
345         /* trying to avoid low memory issues */
346         flags = current->flags & PF_MEMALLOC;
347         current->flags |= PF_MEMALLOC;
348         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
349         current->flags &= (~PF_MEMALLOC | flags);
350         return vaddr;
351 }
352
353 static inline void free_pgtable_page(void *vaddr)
354 {
355         free_page((unsigned long)vaddr);
356 }
357
358 static inline void *alloc_domain_mem(void)
359 {
360         return iommu_kmem_cache_alloc(iommu_domain_cache);
361 }
362
363 static void free_domain_mem(void *vaddr)
364 {
365         kmem_cache_free(iommu_domain_cache, vaddr);
366 }
367
368 static inline void * alloc_devinfo_mem(void)
369 {
370         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
371 }
372
373 static inline void free_devinfo_mem(void *vaddr)
374 {
375         kmem_cache_free(iommu_devinfo_cache, vaddr);
376 }
377
378 struct iova *alloc_iova_mem(void)
379 {
380         return iommu_kmem_cache_alloc(iommu_iova_cache);
381 }
382
383 void free_iova_mem(struct iova *iova)
384 {
385         kmem_cache_free(iommu_iova_cache, iova);
386 }
387
388
389 static inline int width_to_agaw(int width);
390
391 /* calculate agaw for each iommu.
392  * "SAGAW" may be different across iommus, use a default agaw, and
393  * get a supported less agaw for iommus that don't support the default agaw.
394  */
395 int iommu_calculate_agaw(struct intel_iommu *iommu)
396 {
397         unsigned long sagaw;
398         int agaw = -1;
399
400         sagaw = cap_sagaw(iommu->cap);
401         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
402              agaw >= 0; agaw--) {
403                 if (test_bit(agaw, &sagaw))
404                         break;
405         }
406
407         return agaw;
408 }
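/*
 * Worked example: DEFAULT_DOMAIN_ADDRESS_WIDTH is 48, and
 * width_to_agaw(48) = (48 - 30) / 9 = 2, i.e. a 4-level page table.
 * If SAGAW in this iommu's capability register does not have bit 2 set,
 * the loop above falls back to agaw 1 (39-bit, 3-level) or agaw 0
 * (30-bit, 2-level), and returns -1 if the iommu supports none of them.
 */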
409
410 /* in the native (non-virtual-machine) case, each domain is attached to only one iommu */
411 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
412 {
413         int iommu_id;
414
415         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
416
417         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
418         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
419                 return NULL;
420
421         return g_iommus[iommu_id];
422 }
423
424 /* "Coherency" capability may be different across iommus */
425 static void domain_update_iommu_coherency(struct dmar_domain *domain)
426 {
427         int i;
428
429         domain->iommu_coherency = 1;
430
431         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
432         for (; i < g_num_of_iommus; ) {
433                 if (!ecap_coherent(g_iommus[i]->ecap)) {
434                         domain->iommu_coherency = 0;
435                         break;
436                 }
437                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
438         }
439 }
440
441 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
442 {
443         struct dmar_drhd_unit *drhd = NULL;
444         int i;
445
446         for_each_drhd_unit(drhd) {
447                 if (drhd->ignored)
448                         continue;
449
450                 for (i = 0; i < drhd->devices_cnt; i++)
451                         if (drhd->devices[i] &&
452                             drhd->devices[i]->bus->number == bus &&
453                             drhd->devices[i]->devfn == devfn)
454                                 return drhd->iommu;
455
456                 if (drhd->include_all)
457                         return drhd->iommu;
458         }
459
460         return NULL;
461 }
462
463 static void domain_flush_cache(struct dmar_domain *domain,
464                                void *addr, int size)
465 {
466         if (!domain->iommu_coherency)
467                 clflush_cache_range(addr, size);
468 }
469
470 /* Gets context entry for a given bus and devfn */
471 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
472                 u8 bus, u8 devfn)
473 {
474         struct root_entry *root;
475         struct context_entry *context;
476         unsigned long phy_addr;
477         unsigned long flags;
478
479         spin_lock_irqsave(&iommu->lock, flags);
480         root = &iommu->root_entry[bus];
481         context = get_context_addr_from_root(root);
482         if (!context) {
483                 context = (struct context_entry *)alloc_pgtable_page();
484                 if (!context) {
485                         spin_unlock_irqrestore(&iommu->lock, flags);
486                         return NULL;
487                 }
488                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
489                 phy_addr = virt_to_phys((void *)context);
490                 set_root_value(root, phy_addr);
491                 set_root_present(root);
492                 __iommu_flush_cache(iommu, root, sizeof(*root));
493         }
494         spin_unlock_irqrestore(&iommu->lock, flags);
495         return &context[devfn];
496 }
497
498 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
499 {
500         struct root_entry *root;
501         struct context_entry *context;
502         int ret;
503         unsigned long flags;
504
505         spin_lock_irqsave(&iommu->lock, flags);
506         root = &iommu->root_entry[bus];
507         context = get_context_addr_from_root(root);
508         if (!context) {
509                 ret = 0;
510                 goto out;
511         }
512         ret = context_present(&context[devfn]);
513 out:
514         spin_unlock_irqrestore(&iommu->lock, flags);
515         return ret;
516 }
517
518 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
519 {
520         struct root_entry *root;
521         struct context_entry *context;
522         unsigned long flags;
523
524         spin_lock_irqsave(&iommu->lock, flags);
525         root = &iommu->root_entry[bus];
526         context = get_context_addr_from_root(root);
527         if (context) {
528                 context_clear_entry(&context[devfn]);
529                 __iommu_flush_cache(iommu, &context[devfn],
530                         sizeof(*context));
531         }
532         spin_unlock_irqrestore(&iommu->lock, flags);
533 }
534
535 static void free_context_table(struct intel_iommu *iommu)
536 {
537         struct root_entry *root;
538         int i;
539         unsigned long flags;
540         struct context_entry *context;
541
542         spin_lock_irqsave(&iommu->lock, flags);
543         if (!iommu->root_entry) {
544                 goto out;
545         }
546         for (i = 0; i < ROOT_ENTRY_NR; i++) {
547                 root = &iommu->root_entry[i];
548                 context = get_context_addr_from_root(root);
549                 if (context)
550                         free_pgtable_page(context);
551         }
552         free_pgtable_page(iommu->root_entry);
553         iommu->root_entry = NULL;
554 out:
555         spin_unlock_irqrestore(&iommu->lock, flags);
556 }
557
558 /* page table handling */
559 #define LEVEL_STRIDE            (9)
560 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
561
562 static inline int agaw_to_level(int agaw)
563 {
564         return agaw + 2;
565 }
566
567 static inline int agaw_to_width(int agaw)
568 {
569         return 30 + agaw * LEVEL_STRIDE;
570
571 }
572
573 static inline int width_to_agaw(int width)
574 {
575         return (width - 30) / LEVEL_STRIDE;
576 }
577
578 static inline unsigned int level_to_offset_bits(int level)
579 {
580         return (12 + (level - 1) * LEVEL_STRIDE);
581 }
582
583 static inline int address_level_offset(u64 addr, int level)
584 {
585         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
586 }
587
588 static inline u64 level_mask(int level)
589 {
590         return ((u64)-1 << level_to_offset_bits(level));
591 }
592
593 static inline u64 level_size(int level)
594 {
595         return ((u64)1 << level_to_offset_bits(level));
596 }
597
598 static inline u64 align_to_level(u64 addr, int level)
599 {
600         return ((addr + level_size(level) - 1) & level_mask(level));
601 }
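/*
 * Worked example of the helpers above, for a 4-level table (agaw 2,
 * 48-bit address width): each level resolves LEVEL_STRIDE = 9 bits and
 * indexes 512 dma_pte entries, so level 4 covers address bits 39-47,
 * level 3 bits 30-38, level 2 bits 21-29 and level 1 bits 12-20;
 * level_size(1) is 4KiB, level_size(2) is 2MiB, level_size(3) is 1GiB.
 */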
602
603 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
604 {
605         int addr_width = agaw_to_width(domain->agaw);
606         struct dma_pte *parent, *pte = NULL;
607         int level = agaw_to_level(domain->agaw);
608         int offset;
609         unsigned long flags;
610
611         BUG_ON(!domain->pgd);
612
613         addr &= (((u64)1) << addr_width) - 1;
614         parent = domain->pgd;
615
616         spin_lock_irqsave(&domain->mapping_lock, flags);
617         while (level > 0) {
618                 void *tmp_page;
619
620                 offset = address_level_offset(addr, level);
621                 pte = &parent[offset];
622                 if (level == 1)
623                         break;
624
625                 if (!dma_pte_present(pte)) {
626                         tmp_page = alloc_pgtable_page();
627
628                         if (!tmp_page) {
629                                 spin_unlock_irqrestore(&domain->mapping_lock,
630                                         flags);
631                                 return NULL;
632                         }
633                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
634                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
635                         /*
636                          * higher level tables always set r/w; the last level
637                          * page table controls read/write
638                          */
639                         dma_set_pte_readable(pte);
640                         dma_set_pte_writable(pte);
641                         domain_flush_cache(domain, pte, sizeof(*pte));
642                 }
643                 parent = phys_to_virt(dma_pte_addr(pte));
644                 level--;
645         }
646
647         spin_unlock_irqrestore(&domain->mapping_lock, flags);
648         return pte;
649 }
650
651 /* return address's pte at specific level */
652 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
653                 int level)
654 {
655         struct dma_pte *parent, *pte = NULL;
656         int total = agaw_to_level(domain->agaw);
657         int offset;
658
659         parent = domain->pgd;
660         while (level <= total) {
661                 offset = address_level_offset(addr, total);
662                 pte = &parent[offset];
663                 if (level == total)
664                         return pte;
665
666                 if (!dma_pte_present(pte))
667                         break;
668                 parent = phys_to_virt(dma_pte_addr(pte));
669                 total--;
670         }
671         return NULL;
672 }
673
674 /* clear one page's page table */
675 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
676 {
677         struct dma_pte *pte = NULL;
678
679         /* get last level pte */
680         pte = dma_addr_level_pte(domain, addr, 1);
681
682         if (pte) {
683                 dma_clear_pte(pte);
684                 domain_flush_cache(domain, pte, sizeof(*pte));
685         }
686 }
687
688 /* clear last level ptes; a tlb flush should follow */
689 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
690 {
691         int addr_width = agaw_to_width(domain->agaw);
692
693         start &= (((u64)1) << addr_width) - 1;
694         end &= (((u64)1) << addr_width) - 1;
695         /* in case it's a partial page */
696         start = PAGE_ALIGN(start);
697         end &= PAGE_MASK;
698
699         /* we don't need lock here, nobody else touches the iova range */
700         while (start < end) {
701                 dma_pte_clear_one(domain, start);
702                 start += VTD_PAGE_SIZE;
703         }
704 }
705
706 /* free page table pages. last level pte should already be cleared */
707 static void dma_pte_free_pagetable(struct dmar_domain *domain,
708         u64 start, u64 end)
709 {
710         int addr_width = agaw_to_width(domain->agaw);
711         struct dma_pte *pte;
712         int total = agaw_to_level(domain->agaw);
713         int level;
714         u64 tmp;
715
716         start &= (((u64)1) << addr_width) - 1;
717         end &= (((u64)1) << addr_width) - 1;
718
719         /* we don't need lock here, nobody else touches the iova range */
720         level = 2;
721         while (level <= total) {
722                 tmp = align_to_level(start, level);
723                 if (tmp >= end || (tmp + level_size(level) > end))
724                         return;
725
726                 while (tmp < end) {
727                         pte = dma_addr_level_pte(domain, tmp, level);
728                         if (pte) {
729                                 free_pgtable_page(
730                                         phys_to_virt(dma_pte_addr(pte)));
731                                 dma_clear_pte(pte);
732                                 domain_flush_cache(domain, pte, sizeof(*pte));
733                         }
734                         tmp += level_size(level);
735                 }
736                 level++;
737         }
738         /* free pgd */
739         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
740                 free_pgtable_page(domain->pgd);
741                 domain->pgd = NULL;
742         }
743 }
744
745 /* iommu handling */
746 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
747 {
748         struct root_entry *root;
749         unsigned long flags;
750
751         root = (struct root_entry *)alloc_pgtable_page();
752         if (!root)
753                 return -ENOMEM;
754
755         __iommu_flush_cache(iommu, root, ROOT_SIZE);
756
757         spin_lock_irqsave(&iommu->lock, flags);
758         iommu->root_entry = root;
759         spin_unlock_irqrestore(&iommu->lock, flags);
760
761         return 0;
762 }
763
764 static void iommu_set_root_entry(struct intel_iommu *iommu)
765 {
766         void *addr;
767         u32 cmd, sts;
768         unsigned long flag;
769
770         addr = iommu->root_entry;
771
772         spin_lock_irqsave(&iommu->register_lock, flag);
773         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
774
775         cmd = iommu->gcmd | DMA_GCMD_SRTP;
776         writel(cmd, iommu->reg + DMAR_GCMD_REG);
777
778         /* Make sure hardware completes it */
779         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
780                 readl, (sts & DMA_GSTS_RTPS), sts);
781
782         spin_unlock_irqrestore(&iommu->register_lock, flag);
783 }
784
785 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
786 {
787         u32 val;
788         unsigned long flag;
789
790         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
791                 return;
792         val = iommu->gcmd | DMA_GCMD_WBF;
793
794         spin_lock_irqsave(&iommu->register_lock, flag);
795         writel(val, iommu->reg + DMAR_GCMD_REG);
796
797         /* Make sure hardware completes it */
798         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
799                         readl, (!(val & DMA_GSTS_WBFS)), val);
800
801         spin_unlock_irqrestore(&iommu->register_lock, flag);
802 }
803
804 /* return value determines whether we need a write buffer flush */
805 static int __iommu_flush_context(struct intel_iommu *iommu,
806         u16 did, u16 source_id, u8 function_mask, u64 type,
807         int non_present_entry_flush)
808 {
809         u64 val = 0;
810         unsigned long flag;
811
812         /*
813          * In the non-present entry flush case: if the hardware doesn't
814          * cache non-present entries we do nothing; if it does cache them,
815          * we flush the entries of domain 0 (the domain id used to tag
816          * any cached non-present entries)
817          */
818         if (non_present_entry_flush) {
819                 if (!cap_caching_mode(iommu->cap))
820                         return 1;
821                 else
822                         did = 0;
823         }
824
825         switch (type) {
826         case DMA_CCMD_GLOBAL_INVL:
827                 val = DMA_CCMD_GLOBAL_INVL;
828                 break;
829         case DMA_CCMD_DOMAIN_INVL:
830                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
831                 break;
832         case DMA_CCMD_DEVICE_INVL:
833                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
834                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
835                 break;
836         default:
837                 BUG();
838         }
839         val |= DMA_CCMD_ICC;
840
841         spin_lock_irqsave(&iommu->register_lock, flag);
842         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
843
844         /* Make sure hardware completes it */
845         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
846                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
847
848         spin_unlock_irqrestore(&iommu->register_lock, flag);
849
850         /* flush context entry will implicitly flush write buffer */
851         return 0;
852 }
853
854 /* return value determines whether we need a write buffer flush */
855 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
856         u64 addr, unsigned int size_order, u64 type,
857         int non_present_entry_flush)
858 {
859         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
860         u64 val = 0, val_iva = 0;
861         unsigned long flag;
862
863         /*
864          * In the non-present entry flush case: if the hardware doesn't
865          * cache non-present entries we do nothing; if it does cache them,
866          * we flush the entries of domain 0 (the domain id used to tag
867          * any cached non-present entries)
868          */
869         if (non_present_entry_flush) {
870                 if (!cap_caching_mode(iommu->cap))
871                         return 1;
872                 else
873                         did = 0;
874         }
875
876         switch (type) {
877         case DMA_TLB_GLOBAL_FLUSH:
878                 /* global flush doesn't need set IVA_REG */
879                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
880                 break;
881         case DMA_TLB_DSI_FLUSH:
882                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
883                 break;
884         case DMA_TLB_PSI_FLUSH:
885                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
886                 /* Note: always flush non-leaf currently */
887                 val_iva = size_order | addr;
888                 break;
889         default:
890                 BUG();
891         }
892         /* Note: set drain read/write */
893 #if 0
894         /*
895          * This is probably only needed to be extra safe; it looks like
896          * we can skip it without any impact.
897          */
898         if (cap_read_drain(iommu->cap))
899                 val |= DMA_TLB_READ_DRAIN;
900 #endif
901         if (cap_write_drain(iommu->cap))
902                 val |= DMA_TLB_WRITE_DRAIN;
903
904         spin_lock_irqsave(&iommu->register_lock, flag);
905         /* Note: Only uses first TLB reg currently */
906         if (val_iva)
907                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
908         dmar_writeq(iommu->reg + tlb_offset + 8, val);
909
910         /* Make sure hardware completes it */
911         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
912                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
913
914         spin_unlock_irqrestore(&iommu->register_lock, flag);
915
916         /* check IOTLB invalidation granularity */
917         if (DMA_TLB_IAIG(val) == 0)
918                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
919         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
920                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
921                         (unsigned long long)DMA_TLB_IIRG(type),
922                         (unsigned long long)DMA_TLB_IAIG(val));
923         /* flush iotlb entry will implicitly flush write buffer */
924         return 0;
925 }
926
927 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
928         u64 addr, unsigned int pages, int non_present_entry_flush)
929 {
930         unsigned int mask;
931
932         BUG_ON(addr & (~VTD_PAGE_MASK));
933         BUG_ON(pages == 0);
934
935         /* Fallback to domain selective flush if no PSI support */
936         if (!cap_pgsel_inv(iommu->cap))
937                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
938                                                 DMA_TLB_DSI_FLUSH,
939                                                 non_present_entry_flush);
940
941         /*
942          * PSI requires the number of pages to be 2 ^ x, and the base
943          * address to be naturally aligned to that size
944          */
945         mask = ilog2(__roundup_pow_of_two(pages));
946         /* Fallback to domain selective flush if size is too big */
947         if (mask > cap_max_amask_val(iommu->cap))
948                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
949                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
950
951         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
952                                         DMA_TLB_PSI_FLUSH,
953                                         non_present_entry_flush);
954 }
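/*
 * Example of the mask computation above: flushing 5 pages gives
 * __roundup_pow_of_two(5) = 8 and mask = ilog2(8) = 3, so the hardware
 * is asked to invalidate a naturally aligned block of 2^3 = 8 pages
 * covering addr.  If that mask exceeds cap_max_amask_val(), the code
 * falls back to a domain-selective flush instead.
 */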
955
956 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
957 {
958         u32 pmen;
959         unsigned long flags;
960
961         spin_lock_irqsave(&iommu->register_lock, flags);
962         pmen = readl(iommu->reg + DMAR_PMEN_REG);
963         pmen &= ~DMA_PMEN_EPM;
964         writel(pmen, iommu->reg + DMAR_PMEN_REG);
965
966         /* wait for the protected region status bit to clear */
967         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
968                 readl, !(pmen & DMA_PMEN_PRS), pmen);
969
970         spin_unlock_irqrestore(&iommu->register_lock, flags);
971 }
972
973 static int iommu_enable_translation(struct intel_iommu *iommu)
974 {
975         u32 sts;
976         unsigned long flags;
977
978         spin_lock_irqsave(&iommu->register_lock, flags);
979         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
980
981         /* Make sure hardware completes it */
982         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
983                 readl, (sts & DMA_GSTS_TES), sts);
984
985         iommu->gcmd |= DMA_GCMD_TE;
986         spin_unlock_irqrestore(&iommu->register_lock, flags);
987         return 0;
988 }
989
990 static int iommu_disable_translation(struct intel_iommu *iommu)
991 {
992         u32 sts;
993         unsigned long flag;
994
995         spin_lock_irqsave(&iommu->register_lock, flag);
996         iommu->gcmd &= ~DMA_GCMD_TE;
997         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
998
999         /* Make sure hardware completes it */
1000         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1001                 readl, (!(sts & DMA_GSTS_TES)), sts);
1002
1003         spin_unlock_irqrestore(&iommu->register_lock, flag);
1004         return 0;
1005 }
1006
1007 /* iommu interrupt handling. Most of it is MSI-like. */
1008
1009 static const char *fault_reason_strings[] =
1010 {
1011         "Software",
1012         "Present bit in root entry is clear",
1013         "Present bit in context entry is clear",
1014         "Invalid context entry",
1015         "Access beyond MGAW",
1016         "PTE Write access is not set",
1017         "PTE Read access is not set",
1018         "Next page table ptr is invalid",
1019         "Root table address invalid",
1020         "Context table ptr is invalid",
1021         "non-zero reserved fields in RTP",
1022         "non-zero reserved fields in CTP",
1023         "non-zero reserved fields in PTE",
1024 };
1025 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1026
1027 const char *dmar_get_fault_reason(u8 fault_reason)
1028 {
1029         if (fault_reason > MAX_FAULT_REASON_IDX)
1030                 return "Unknown";
1031         else
1032                 return fault_reason_strings[fault_reason];
1033 }
1034
1035 void dmar_msi_unmask(unsigned int irq)
1036 {
1037         struct intel_iommu *iommu = get_irq_data(irq);
1038         unsigned long flag;
1039
1040         /* unmask it */
1041         spin_lock_irqsave(&iommu->register_lock, flag);
1042         writel(0, iommu->reg + DMAR_FECTL_REG);
1043         /* Read a reg to force flush the posted write */
1044         readl(iommu->reg + DMAR_FECTL_REG);
1045         spin_unlock_irqrestore(&iommu->register_lock, flag);
1046 }
1047
1048 void dmar_msi_mask(unsigned int irq)
1049 {
1050         unsigned long flag;
1051         struct intel_iommu *iommu = get_irq_data(irq);
1052
1053         /* mask it */
1054         spin_lock_irqsave(&iommu->register_lock, flag);
1055         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1056         /* Read a reg to force flush the posted write */
1057         readl(iommu->reg + DMAR_FECTL_REG);
1058         spin_unlock_irqrestore(&iommu->register_lock, flag);
1059 }
1060
1061 void dmar_msi_write(int irq, struct msi_msg *msg)
1062 {
1063         struct intel_iommu *iommu = get_irq_data(irq);
1064         unsigned long flag;
1065
1066         spin_lock_irqsave(&iommu->register_lock, flag);
1067         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1068         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1069         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1070         spin_unlock_irqrestore(&iommu->register_lock, flag);
1071 }
1072
1073 void dmar_msi_read(int irq, struct msi_msg *msg)
1074 {
1075         struct intel_iommu *iommu = get_irq_data(irq);
1076         unsigned long flag;
1077
1078         spin_lock_irqsave(&iommu->register_lock, flag);
1079         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1080         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1081         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1082         spin_unlock_irqrestore(&iommu->register_lock, flag);
1083 }
1084
1085 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1086                 u8 fault_reason, u16 source_id, unsigned long long addr)
1087 {
1088         const char *reason;
1089
1090         reason = dmar_get_fault_reason(fault_reason);
1091
1092         printk(KERN_ERR
1093                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1094                 "fault addr %llx \n"
1095                 "DMAR:[fault reason %02d] %s\n",
1096                 (type ? "DMA Read" : "DMA Write"),
1097                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1098                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1099         return 0;
1100 }
1101
1102 #define PRIMARY_FAULT_REG_LEN (16)
1103 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1104 {
1105         struct intel_iommu *iommu = dev_id;
1106         int reg, fault_index;
1107         u32 fault_status;
1108         unsigned long flag;
1109
1110         spin_lock_irqsave(&iommu->register_lock, flag);
1111         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1112
1113         /* TBD: ignore advanced fault log currently */
1114         if (!(fault_status & DMA_FSTS_PPF))
1115                 goto clear_overflow;
1116
1117         fault_index = dma_fsts_fault_record_index(fault_status);
1118         reg = cap_fault_reg_offset(iommu->cap);
1119         while (1) {
1120                 u8 fault_reason;
1121                 u16 source_id;
1122                 u64 guest_addr;
1123                 int type;
1124                 u32 data;
1125
1126                 /* highest 32 bits */
1127                 data = readl(iommu->reg + reg +
1128                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1129                 if (!(data & DMA_FRCD_F))
1130                         break;
1131
1132                 fault_reason = dma_frcd_fault_reason(data);
1133                 type = dma_frcd_type(data);
1134
1135                 data = readl(iommu->reg + reg +
1136                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1137                 source_id = dma_frcd_source_id(data);
1138
1139                 guest_addr = dmar_readq(iommu->reg + reg +
1140                                 fault_index * PRIMARY_FAULT_REG_LEN);
1141                 guest_addr = dma_frcd_page_addr(guest_addr);
1142                 /* clear the fault */
1143                 writel(DMA_FRCD_F, iommu->reg + reg +
1144                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1145
1146                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1147
1148                 iommu_page_fault_do_one(iommu, type, fault_reason,
1149                                 source_id, guest_addr);
1150
1151                 fault_index++;
1152                 if (fault_index >= cap_num_fault_regs(iommu->cap))
1153                         fault_index = 0;
1154                 spin_lock_irqsave(&iommu->register_lock, flag);
1155         }
1156 clear_overflow:
1157         /* clear primary fault overflow */
1158         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1159         if (fault_status & DMA_FSTS_PFO)
1160                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1161
1162         spin_unlock_irqrestore(&iommu->register_lock, flag);
1163         return IRQ_HANDLED;
1164 }
1165
1166 int dmar_set_interrupt(struct intel_iommu *iommu)
1167 {
1168         int irq, ret;
1169
1170         irq = create_irq();
1171         if (!irq) {
1172                 printk(KERN_ERR "IOMMU: no free vectors\n");
1173                 return -EINVAL;
1174         }
1175
1176         set_irq_data(irq, iommu);
1177         iommu->irq = irq;
1178
1179         ret = arch_setup_dmar_msi(irq);
1180         if (ret) {
1181                 set_irq_data(irq, NULL);
1182                 iommu->irq = 0;
1183                 destroy_irq(irq);
1184                 return ret;
1185         }
1186
1187         /* Force the fault registers to be cleared */
1188         iommu_page_fault(irq, iommu);
1189
1190         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1191         if (ret)
1192                 printk(KERN_ERR "IOMMU: can't request irq\n");
1193         return ret;
1194 }
1195
1196 static int iommu_init_domains(struct intel_iommu *iommu)
1197 {
1198         unsigned long ndomains;
1199         unsigned long nlongs;
1200
1201         ndomains = cap_ndoms(iommu->cap);
1202         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1203         nlongs = BITS_TO_LONGS(ndomains);
1204
1205         /* TBD: there might be 64K domains,
1206          * consider other allocation for future chip
1207          */
1208         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1209         if (!iommu->domain_ids) {
1210                 printk(KERN_ERR "Allocating domain id array failed\n");
1211                 return -ENOMEM;
1212         }
1213         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1214                         GFP_KERNEL);
1215         if (!iommu->domains) {
1216                 printk(KERN_ERR "Allocating domain array failed\n");
1217                 kfree(iommu->domain_ids);
1218                 return -ENOMEM;
1219         }
1220
1221         spin_lock_init(&iommu->lock);
1222
1223         /*
1224          * if Caching mode is set, then invalid translations are tagged
1225          * with domainid 0. Hence we need to pre-allocate it.
1226          */
1227         if (cap_caching_mode(iommu->cap))
1228                 set_bit(0, iommu->domain_ids);
1229         return 0;
1230 }
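/*
 * Example of the sizing above (illustrative): hardware reporting
 * cap_ndoms() == 256 needs BITS_TO_LONGS(256) == 4 longs of domain-id
 * bitmap on a 64-bit kernel, plus an array of 256 dmar_domain pointers.
 * Domain id 0 is additionally reserved whenever caching mode is set.
 */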
1231
1232
1233 static void domain_exit(struct dmar_domain *domain);
1234 static void vm_domain_exit(struct dmar_domain *domain);
1235
1236 void free_dmar_iommu(struct intel_iommu *iommu)
1237 {
1238         struct dmar_domain *domain;
1239         int i;
1240         unsigned long flags;
1241
1242         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1243         for (; i < cap_ndoms(iommu->cap); ) {
1244                 domain = iommu->domains[i];
1245                 clear_bit(i, iommu->domain_ids);
1246
1247                 spin_lock_irqsave(&domain->iommu_lock, flags);
1248                 if (--domain->iommu_count == 0) {
1249                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1250                                 vm_domain_exit(domain);
1251                         else
1252                                 domain_exit(domain);
1253                 }
1254                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1255
1256                 i = find_next_bit(iommu->domain_ids,
1257                         cap_ndoms(iommu->cap), i+1);
1258         }
1259
1260         if (iommu->gcmd & DMA_GCMD_TE)
1261                 iommu_disable_translation(iommu);
1262
1263         if (iommu->irq) {
1264                 set_irq_data(iommu->irq, NULL);
1265                 /* This will mask the irq */
1266                 free_irq(iommu->irq, iommu);
1267                 destroy_irq(iommu->irq);
1268         }
1269
1270         kfree(iommu->domains);
1271         kfree(iommu->domain_ids);
1272
1273         g_iommus[iommu->seq_id] = NULL;
1274
1275         /* if all iommus are freed, free g_iommus */
1276         for (i = 0; i < g_num_of_iommus; i++) {
1277                 if (g_iommus[i])
1278                         break;
1279         }
1280
1281         if (i == g_num_of_iommus)
1282                 kfree(g_iommus);
1283
1284         /* free context mapping */
1285         free_context_table(iommu);
1286 }
1287
1288 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1289 {
1290         unsigned long num;
1291         unsigned long ndomains;
1292         struct dmar_domain *domain;
1293         unsigned long flags;
1294
1295         domain = alloc_domain_mem();
1296         if (!domain)
1297                 return NULL;
1298
1299         ndomains = cap_ndoms(iommu->cap);
1300
1301         spin_lock_irqsave(&iommu->lock, flags);
1302         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1303         if (num >= ndomains) {
1304                 spin_unlock_irqrestore(&iommu->lock, flags);
1305                 free_domain_mem(domain);
1306                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1307                 return NULL;
1308         }
1309
1310         set_bit(num, iommu->domain_ids);
1311         domain->id = num;
1312         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1313         set_bit(iommu->seq_id, &domain->iommu_bmp);
1314         domain->flags = 0;
1315         iommu->domains[num] = domain;
1316         spin_unlock_irqrestore(&iommu->lock, flags);
1317
1318         return domain;
1319 }
1320
1321 static void iommu_free_domain(struct dmar_domain *domain)
1322 {
1323         unsigned long flags;
1324         struct intel_iommu *iommu;
1325
1326         iommu = domain_get_iommu(domain);
1327
1328         spin_lock_irqsave(&iommu->lock, flags);
1329         clear_bit(domain->id, iommu->domain_ids);
1330         spin_unlock_irqrestore(&iommu->lock, flags);
1331 }
1332
1333 static struct iova_domain reserved_iova_list;
1334 static struct lock_class_key reserved_alloc_key;
1335 static struct lock_class_key reserved_rbtree_key;
1336
1337 static void dmar_init_reserved_ranges(void)
1338 {
1339         struct pci_dev *pdev = NULL;
1340         struct iova *iova;
1341         int i;
1342         u64 addr, size;
1343
1344         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1345
1346         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1347                 &reserved_alloc_key);
1348         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1349                 &reserved_rbtree_key);
1350
1351         /* IOAPIC ranges shouldn't be accessed by DMA */
1352         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1353                 IOVA_PFN(IOAPIC_RANGE_END));
1354         if (!iova)
1355                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1356
1357         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1358         for_each_pci_dev(pdev) {
1359                 struct resource *r;
1360
1361                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1362                         r = &pdev->resource[i];
1363                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1364                                 continue;
1365                         addr = r->start;
1366                         addr &= PAGE_MASK;
1367                         size = r->end - addr;
1368                         size = PAGE_ALIGN(size);
1369                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1370                                 IOVA_PFN(size + addr) - 1);
1371                         if (!iova)
1372                                 printk(KERN_ERR "Reserve iova failed\n");
1373                 }
1374         }
1375
1376 }
1377
1378 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1379 {
1380         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1381 }
1382
1383 static inline int guestwidth_to_adjustwidth(int gaw)
1384 {
1385         int agaw;
1386         int r = (gaw - 12) % 9;
1387
1388         if (r == 0)
1389                 agaw = gaw;
1390         else
1391                 agaw = gaw + 9 - r;
1392         if (agaw > 64)
1393                 agaw = 64;
1394         return agaw;
1395 }
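/*
 * guestwidth_to_adjustwidth() rounds the guest address width up so that
 * (width - 12) is a multiple of the 9-bit level stride, capping at 64:
 * e.g. a 36-bit guest width gives r = (36 - 12) % 9 = 6 and an adjusted
 * width of 36 + 9 - 6 = 39, while 48 is already exact and is returned
 * unchanged.
 */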
1396
1397 static int domain_init(struct dmar_domain *domain, int guest_width)
1398 {
1399         struct intel_iommu *iommu;
1400         int adjust_width, agaw;
1401         unsigned long sagaw;
1402
1403         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1404         spin_lock_init(&domain->mapping_lock);
1405         spin_lock_init(&domain->iommu_lock);
1406
1407         domain_reserve_special_ranges(domain);
1408
1409         /* calculate AGAW */
1410         iommu = domain_get_iommu(domain);
1411         if (guest_width > cap_mgaw(iommu->cap))
1412                 guest_width = cap_mgaw(iommu->cap);
1413         domain->gaw = guest_width;
1414         adjust_width = guestwidth_to_adjustwidth(guest_width);
1415         agaw = width_to_agaw(adjust_width);
1416         sagaw = cap_sagaw(iommu->cap);
1417         if (!test_bit(agaw, &sagaw)) {
1418                 /* hardware doesn't support it, choose a bigger one */
1419                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1420                 agaw = find_next_bit(&sagaw, 5, agaw);
1421                 if (agaw >= 5)
1422                         return -ENODEV;
1423         }
1424         domain->agaw = agaw;
1425         INIT_LIST_HEAD(&domain->devices);
1426
1427         if (ecap_coherent(iommu->ecap))
1428                 domain->iommu_coherency = 1;
1429         else
1430                 domain->iommu_coherency = 0;
1431
1432         domain->iommu_count = 1;
1433
1434         /* always allocate the top pgd */
1435         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1436         if (!domain->pgd)
1437                 return -ENOMEM;
1438         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1439         return 0;
1440 }
1441
1442 static void domain_exit(struct dmar_domain *domain)
1443 {
1444         u64 end;
1445
1446         /* Domain 0 is reserved, so don't process it */
1447         if (!domain)
1448                 return;
1449
1450         domain_remove_dev_info(domain);
1451         /* destroy iovas */
1452         put_iova_domain(&domain->iovad);
1453         end = DOMAIN_MAX_ADDR(domain->gaw);
1454         end = end & (~PAGE_MASK);
1455
1456         /* clear ptes */
1457         dma_pte_clear_range(domain, 0, end);
1458
1459         /* free page tables */
1460         dma_pte_free_pagetable(domain, 0, end);
1461
1462         iommu_free_domain(domain);
1463         free_domain_mem(domain);
1464 }
1465
1466 static int domain_context_mapping_one(struct dmar_domain *domain,
1467                 u8 bus, u8 devfn)
1468 {
1469         struct context_entry *context;
1470         unsigned long flags;
1471         struct intel_iommu *iommu;
1472         struct dma_pte *pgd;
1473         unsigned long num;
1474         unsigned long ndomains;
1475         int id;
1476         int agaw;
1477
1478         pr_debug("Set context mapping for %02x:%02x.%d\n",
1479                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1480         BUG_ON(!domain->pgd);
1481
1482         iommu = device_to_iommu(bus, devfn);
1483         if (!iommu)
1484                 return -ENODEV;
1485
1486         context = device_to_context_entry(iommu, bus, devfn);
1487         if (!context)
1488                 return -ENOMEM;
1489         spin_lock_irqsave(&iommu->lock, flags);
1490         if (context_present(context)) {
1491                 spin_unlock_irqrestore(&iommu->lock, flags);
1492                 return 0;
1493         }
1494
1495         id = domain->id;
1496         pgd = domain->pgd;
1497
1498         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1499                 int found = 0;
1500
1501                 /* find an available domain id for this device in iommu */
1502                 ndomains = cap_ndoms(iommu->cap);
1503                 num = find_first_bit(iommu->domain_ids, ndomains);
1504                 for (; num < ndomains; ) {
1505                         if (iommu->domains[num] == domain) {
1506                                 id = num;
1507                                 found = 1;
1508                                 break;
1509                         }
1510                         num = find_next_bit(iommu->domain_ids,
1511                                             cap_ndoms(iommu->cap), num+1);
1512                 }
1513
1514                 if (found == 0) {
1515                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1516                         if (num >= ndomains) {
1517                                 spin_unlock_irqrestore(&iommu->lock, flags);
1518                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1519                                 return -EFAULT;
1520                         }
1521
1522                         set_bit(num, iommu->domain_ids);
1523                         iommu->domains[num] = domain;
1524                         id = num;
1525                 }
1526
1527                 /* Skip top levels of page tables for
1528                  * an iommu whose agaw is smaller than the default.
1529                  */
1530                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1531                         pgd = phys_to_virt(dma_pte_addr(pgd));
1532                         if (!dma_pte_present(pgd)) {
1533                                 spin_unlock_irqrestore(&iommu->lock, flags);
1534                                 return -ENOMEM;
1535                         }
1536                 }
1537         }
1538
1539         context_set_domain_id(context, id);
1540         context_set_address_width(context, iommu->agaw);
1541         context_set_address_root(context, virt_to_phys(pgd));
1542         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1543         context_set_fault_enable(context);
1544         context_set_present(context);
1545         domain_flush_cache(domain, context, sizeof(*context));
1546
1547         /* it's a non-present to present mapping */
1548         if (iommu->flush.flush_context(iommu, domain->id,
1549                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1550                 DMA_CCMD_DEVICE_INVL, 1))
1551                 iommu_flush_write_buffer(iommu);
1552         else
1553                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1554
1555         spin_unlock_irqrestore(&iommu->lock, flags);
1556
1557         spin_lock_irqsave(&domain->iommu_lock, flags);
1558         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1559                 domain->iommu_count++;
1560                 domain_update_iommu_coherency(domain);
1561         }
1562         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1563         return 0;
1564 }
1565
1566 static int
1567 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1568 {
1569         int ret;
1570         struct pci_dev *tmp, *parent;
1571
1572         ret = domain_context_mapping_one(domain, pdev->bus->number,
1573                 pdev->devfn);
1574         if (ret)
1575                 return ret;
1576
1577         /* dependent device mapping */
1578         tmp = pci_find_upstream_pcie_bridge(pdev);
1579         if (!tmp)
1580                 return 0;
1581         /* Secondary interface's bus number and devfn 0 */
1582         parent = pdev->bus->self;
1583         while (parent != tmp) {
1584                 ret = domain_context_mapping_one(domain, parent->bus->number,
1585                         parent->devfn);
1586                 if (ret)
1587                         return ret;
1588                 parent = parent->bus->self;
1589         }
1590         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1591                 return domain_context_mapping_one(domain,
1592                         tmp->subordinate->number, 0);
1593         else /* this is a legacy PCI bridge */
1594                 return domain_context_mapping_one(domain,
1595                         tmp->bus->number, tmp->devfn);
1596 }
1597
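/*
 * Check whether pdev and every bridge on the path to its IOMMU already
 * have context entries; returns zero as soon as one of them is found
 * unmapped (or -ENODEV if no IOMMU covers the device).
 */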
1598 static int domain_context_mapped(struct pci_dev *pdev)
1599 {
1600         int ret;
1601         struct pci_dev *tmp, *parent;
1602         struct intel_iommu *iommu;
1603
1604         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1605         if (!iommu)
1606                 return -ENODEV;
1607
1608         ret = device_context_mapped(iommu,
1609                 pdev->bus->number, pdev->devfn);
1610         if (!ret)
1611                 return ret;
1612         /* dependent device mapping */
1613         tmp = pci_find_upstream_pcie_bridge(pdev);
1614         if (!tmp)
1615                 return ret;
1616         /* Secondary interface's bus number and devfn 0 */
1617         parent = pdev->bus->self;
1618         while (parent != tmp) {
1619                 ret = device_context_mapped(iommu, parent->bus->number,
1620                         parent->devfn);
1621                 if (!ret)
1622                         return ret;
1623                 parent = parent->bus->self;
1624         }
1625         if (tmp->is_pcie)
1626                 return device_context_mapped(iommu,
1627                         tmp->subordinate->number, 0);
1628         else
1629                 return device_context_mapped(iommu,
1630                         tmp->bus->number, tmp->devfn);
1631 }
1632
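/*
 * Map the physical range hpa .. hpa + size at IO virtual address iova in
 * this domain, one VT-d page at a time, flushing the cache for each PTE.
 * The caller must ensure the iova range is unused; an already-present PTE
 * is a bug.
 */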
1633 static int
1634 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1635                         u64 hpa, size_t size, int prot)
1636 {
1637         u64 start_pfn, end_pfn;
1638         struct dma_pte *pte;
1639         int index;
1640         int addr_width = agaw_to_width(domain->agaw);
1641
1642         hpa &= (((u64)1) << addr_width) - 1;
1643
1644         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1645                 return -EINVAL;
1646         iova &= PAGE_MASK;
1647         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1648         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1649         index = 0;
1650         while (start_pfn < end_pfn) {
1651                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1652                 if (!pte)
1653                         return -ENOMEM;
1654                 /* We don't need lock here, nobody else
1655                  * touches the iova range
1656                  */
1657                 BUG_ON(dma_pte_addr(pte));
1658                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1659                 dma_set_pte_prot(pte, prot);
1660                 domain_flush_cache(domain, pte, sizeof(*pte));
1661                 start_pfn++;
1662                 index++;
1663         }
1664         return 0;
1665 }
1666
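/*
 * Clear the context entry for (bus, devfn) and invalidate the context
 * cache and IOTLB globally so the hardware stops using it.
 */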
1667 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1668 {
1669         if (!iommu)
1670                 return;
1671
1672         clear_context_table(iommu, bus, devfn);
1673         iommu->flush.flush_context(iommu, 0, 0, 0,
1674                                            DMA_CCMD_GLOBAL_INVL, 0);
1675         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1676                                          DMA_TLB_GLOBAL_FLUSH, 0);
1677 }
1678
1679 static void domain_remove_dev_info(struct dmar_domain *domain)
1680 {
1681         struct device_domain_info *info;
1682         unsigned long flags;
1683         struct intel_iommu *iommu;
1684
1685         spin_lock_irqsave(&device_domain_lock, flags);
1686         while (!list_empty(&domain->devices)) {
1687                 info = list_entry(domain->devices.next,
1688                         struct device_domain_info, link);
1689                 list_del(&info->link);
1690                 list_del(&info->global);
1691                 if (info->dev)
1692                         info->dev->dev.archdata.iommu = NULL;
1693                 spin_unlock_irqrestore(&device_domain_lock, flags);
1694
1695                 iommu = device_to_iommu(info->bus, info->devfn);
1696                 iommu_detach_dev(iommu, info->bus, info->devfn);
1697                 free_devinfo_mem(info);
1698
1699                 spin_lock_irqsave(&device_domain_lock, flags);
1700         }
1701         spin_unlock_irqrestore(&device_domain_lock, flags);
1702 }
1703
1704 /*
1705  * find_domain
1706  * Note: struct pci_dev->dev.archdata.iommu is used to store the info
1707  */
1708 static struct dmar_domain *
1709 find_domain(struct pci_dev *pdev)
1710 {
1711         struct device_domain_info *info;
1712
1713         /* No lock here, assumes no domain exit in normal case */
1714         info = pdev->dev.archdata.iommu;
1715         if (info)
1716                 return info->domain;
1717         return NULL;
1718 }
1719
1720 /* Find or create the domain for pdev; the returned domain is initialized */
1721 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1722 {
1723         struct dmar_domain *domain, *found = NULL;
1724         struct intel_iommu *iommu;
1725         struct dmar_drhd_unit *drhd;
1726         struct device_domain_info *info, *tmp;
1727         struct pci_dev *dev_tmp;
1728         unsigned long flags;
1729         int bus = 0, devfn = 0;
1730
1731         domain = find_domain(pdev);
1732         if (domain)
1733                 return domain;
1734
1735         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1736         if (dev_tmp) {
1737                 if (dev_tmp->is_pcie) {
1738                         bus = dev_tmp->subordinate->number;
1739                         devfn = 0;
1740                 } else {
1741                         bus = dev_tmp->bus->number;
1742                         devfn = dev_tmp->devfn;
1743                 }
1744                 spin_lock_irqsave(&device_domain_lock, flags);
1745                 list_for_each_entry(info, &device_domain_list, global) {
1746                         if (info->bus == bus && info->devfn == devfn) {
1747                                 found = info->domain;
1748                                 break;
1749                         }
1750                 }
1751                 spin_unlock_irqrestore(&device_domain_lock, flags);
1752                 /* the pcie-pci bridge already has a domain, use it */
1753                 if (found) {
1754                         domain = found;
1755                         goto found_domain;
1756                 }
1757         }
1758
1759         /* Allocate new domain for the device */
1760         drhd = dmar_find_matched_drhd_unit(pdev);
1761         if (!drhd) {
1762                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1763                         pci_name(pdev));
1764                 return NULL;
1765         }
1766         iommu = drhd->iommu;
1767
1768         domain = iommu_alloc_domain(iommu);
1769         if (!domain)
1770                 goto error;
1771
1772         if (domain_init(domain, gaw)) {
1773                 domain_exit(domain);
1774                 goto error;
1775         }
1776
1777         /* register pcie-to-pci device */
1778         if (dev_tmp) {
1779                 info = alloc_devinfo_mem();
1780                 if (!info) {
1781                         domain_exit(domain);
1782                         goto error;
1783                 }
1784                 info->bus = bus;
1785                 info->devfn = devfn;
1786                 info->dev = NULL;
1787                 info->domain = domain;
1788                 /* This domain is shared by devices under p2p bridge */
1789                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1790
1791                 /* recheck under the lock: the bridge may have been registered meanwhile */
1792                 found = NULL;
1793                 spin_lock_irqsave(&device_domain_lock, flags);
1794                 list_for_each_entry(tmp, &device_domain_list, global) {
1795                         if (tmp->bus == bus && tmp->devfn == devfn) {
1796                                 found = tmp->domain;
1797                                 break;
1798                         }
1799                 }
1800                 if (found) {
1801                         free_devinfo_mem(info);
1802                         domain_exit(domain);
1803                         domain = found;
1804                 } else {
1805                         list_add(&info->link, &domain->devices);
1806                         list_add(&info->global, &device_domain_list);
1807                 }
1808                 spin_unlock_irqrestore(&device_domain_lock, flags);
1809         }
1810
1811 found_domain:
1812         info = alloc_devinfo_mem();
1813         if (!info)
1814                 goto error;
1815         info->bus = pdev->bus->number;
1816         info->devfn = pdev->devfn;
1817         info->dev = pdev;
1818         info->domain = domain;
1819         spin_lock_irqsave(&device_domain_lock, flags);
1820         /* somebody else registered this device first */
1821         found = find_domain(pdev);
1822         if (found != NULL) {
1823                 spin_unlock_irqrestore(&device_domain_lock, flags);
1824                 if (found != domain) {
1825                         domain_exit(domain);
1826                         domain = found;
1827                 }
1828                 free_devinfo_mem(info);
1829                 return domain;
1830         }
1831         list_add(&info->link, &domain->devices);
1832         list_add(&info->global, &device_domain_list);
1833         pdev->dev.archdata.iommu = info;
1834         spin_unlock_irqrestore(&device_domain_lock, flags);
1835         return domain;
1836 error:
1837         /* recheck here: another thread may have set up the domain already */
1838         return find_domain(pdev);
1839 }
1840
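/*
 * Reserve the IOVA range [start, end) in the device's domain and install a
 * 1:1 (identity) mapping for it, then set up the context entry.  Used for
 * RMRR regions and for the gfx/ISA work-arounds below.
 */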
1841 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1842                                       unsigned long long start,
1843                                       unsigned long long end)
1844 {
1845         struct dmar_domain *domain;
1846         unsigned long size;
1847         unsigned long long base;
1848         int ret;
1849
1850         printk(KERN_INFO
1851                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1852                 pci_name(pdev), start, end);
1853         /* page table init */
1854         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1855         if (!domain)
1856                 return -ENOMEM;
1857
1858         /* The address might not be aligned */
1859         base = start & PAGE_MASK;
1860         size = end - base;
1861         size = PAGE_ALIGN(size);
1862         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1863                         IOVA_PFN(base + size) - 1)) {
1864                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1865                 ret = -ENOMEM;
1866                 goto error;
1867         }
1868
1869         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1870                 size, base, pci_name(pdev));
1871         /*
1872          * The RMRR range might overlap an already-mapped physical memory
1873          * range, so clear it first
1874          */
1875         dma_pte_clear_range(domain, base, base + size);
1876
1877         ret = domain_page_mapping(domain, base, base, size,
1878                 DMA_PTE_READ|DMA_PTE_WRITE);
1879         if (ret)
1880                 goto error;
1881
1882         /* context entry init */
1883         ret = domain_context_mapping(domain, pdev);
1884         if (!ret)
1885                 return 0;
1886 error:
1887         domain_exit(domain);
1888         return ret;
1889
1890 }
1891
1892 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1893         struct pci_dev *pdev)
1894 {
1895         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1896                 return 0;
1897         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1898                 rmrr->end_address + 1);
1899 }
1900
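/*
 * Graphics work-around: iommu_prepare_gfx_mapping() identity-maps the
 * active memory regions of every online node for each PCI graphics device
 * that is not already ignored, so graphics DMA keeps working once
 * translation is enabled.
 */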
1901 #ifdef CONFIG_DMAR_GFX_WA
1902 struct iommu_prepare_data {
1903         struct pci_dev *pdev;
1904         int ret;
1905 };
1906
1907 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1908                                          unsigned long end_pfn, void *datax)
1909 {
1910         struct iommu_prepare_data *data;
1911
1912         data = (struct iommu_prepare_data *)datax;
1913
1914         data->ret = iommu_prepare_identity_map(data->pdev,
1915                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1916         return data->ret;
1917
1918 }
1919
1920 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1921 {
1922         int nid;
1923         struct iommu_prepare_data data;
1924
1925         data.pdev = pdev;
1926         data.ret = 0;
1927
1928         for_each_online_node(nid) {
1929                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1930                 if (data.ret)
1931                         return data.ret;
1932         }
1933         return data.ret;
1934 }
1935
1936 static void __init iommu_prepare_gfx_mapping(void)
1937 {
1938         struct pci_dev *pdev = NULL;
1939         int ret;
1940
1941         for_each_pci_dev(pdev) {
1942                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1943                                 !IS_GFX_DEVICE(pdev))
1944                         continue;
1945                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1946                         pci_name(pdev));
1947                 ret = iommu_prepare_with_active_regions(pdev);
1948                 if (ret)
1949                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1950         }
1951 }
1952 #else /* !CONFIG_DMAR_GFX_WA */
1953 static inline void iommu_prepare_gfx_mapping(void)
1954 {
1955         return;
1956 }
1957 #endif
1958
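/*
 * Floppy work-around: identity-map the first 16MB for the first PCI-ISA
 * (LPC) bridge so legacy ISA DMA, e.g. the floppy controller, keeps
 * working with translation enabled.
 */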
1959 #ifdef CONFIG_DMAR_FLOPPY_WA
1960 static inline void iommu_prepare_isa(void)
1961 {
1962         struct pci_dev *pdev;
1963         int ret;
1964
1965         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1966         if (!pdev)
1967                 return;
1968
1969         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1970         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1971
1972         if (ret)
1973                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1974                         "floppy might not work\n");
1975
1976 }
1977 #else
1978 static inline void iommu_prepare_isa(void)
1979 {
1980         return;
1981 }
1982 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1983
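/*
 * init_dmars() brings DMA remapping up:
 *  - allocate the global iommu array and the deferred-flush tables
 *  - for every usable DRHD unit, initialize domain-id bookkeeping and
 *    allocate a root entry table
 *  - choose queued invalidation if available, register-based otherwise
 *  - set up identity maps for RMRRs and the gfx/ISA work-arounds
 *  - program the root entries, enable fault reporting and turn
 *    translation on
 */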
1984 static int __init init_dmars(void)
1985 {
1986         struct dmar_drhd_unit *drhd;
1987         struct dmar_rmrr_unit *rmrr;
1988         struct pci_dev *pdev;
1989         struct intel_iommu *iommu;
1990         int i, ret, unit = 0;
1991
1992         /*
1993          * for each drhd
1994          *    allocate root
1995          *    initialize and program root entry to not present
1996          * endfor
1997          */
1998         for_each_drhd_unit(drhd) {
1999                 g_num_of_iommus++;
2000                 /*
2001                  * No lock needed: this is only incremented in the
2002                  * single-threaded kernel __init code path; all other
2003                  * accesses are read-only.
2004                  */
2005         }
2006
2007         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2008                         GFP_KERNEL);
2009         if (!g_iommus) {
2010                 printk(KERN_ERR "Allocating global iommu array failed\n");
2011                 ret = -ENOMEM;
2012                 goto error;
2013         }
2014
2015         deferred_flush = kzalloc(g_num_of_iommus *
2016                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2017         if (!deferred_flush) {
2018                 kfree(g_iommus);
2019                 ret = -ENOMEM;
2020                 goto error;
2021         }
2022
2023         for_each_drhd_unit(drhd) {
2024                 if (drhd->ignored)
2025                         continue;
2026
2027                 iommu = drhd->iommu;
2028                 g_iommus[iommu->seq_id] = iommu;
2029
2030                 ret = iommu_init_domains(iommu);
2031                 if (ret)
2032                         goto error;
2033
2034                 /*
2035                  * TBD:
2036                  * we could share the same root & context tables
2037                  * among all IOMMUs.  Needs to be split out later.
2038                  */
2039                 ret = iommu_alloc_root_entry(iommu);
2040                 if (ret) {
2041                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2042                         goto error;
2043                 }
2044         }
2045
2046         for_each_drhd_unit(drhd) {
2047                 if (drhd->ignored)
2048                         continue;
2049
2050                 iommu = drhd->iommu;
2051                 if (dmar_enable_qi(iommu)) {
2052                         /*
2053                          * Queued Invalidate not enabled, use Register Based
2054                          * Invalidate
2055                          */
2056                         iommu->flush.flush_context = __iommu_flush_context;
2057                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2058                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2059                                "invalidation\n",
2060                                (unsigned long long)drhd->reg_base_addr);
2061                 } else {
2062                         iommu->flush.flush_context = qi_flush_context;
2063                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2064                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2065                                "invalidation\n",
2066                                (unsigned long long)drhd->reg_base_addr);
2067                 }
2068         }
2069
2070         /*
2071          * For each rmrr
2072          *   for each dev attached to rmrr
2073          *   do
2074          *     locate drhd for dev, alloc domain for dev
2075          *     allocate free domain
2076          *     allocate page table entries for rmrr
2077          *     if context not allocated for bus
2078          *           allocate and init context
2079          *           set present in root table for this bus
2080          *     init context with domain, translation etc
2081          *    endfor
2082          * endfor
2083          */
2084         for_each_rmrr_units(rmrr) {
2085                 for (i = 0; i < rmrr->devices_cnt; i++) {
2086                         pdev = rmrr->devices[i];
2087                         /* some BIOSes list non-existent devices in the DMAR table */
2088                         if (!pdev)
2089                                 continue;
2090                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2091                         if (ret)
2092                                 printk(KERN_ERR
2093                                  "IOMMU: mapping reserved region failed\n");
2094                 }
2095         }
2096
2097         iommu_prepare_gfx_mapping();
2098
2099         iommu_prepare_isa();
2100
2101         /*
2102          * for each drhd
2103          *   enable fault log
2104          *   global invalidate context cache
2105          *   global invalidate iotlb
2106          *   enable translation
2107          */
2108         for_each_drhd_unit(drhd) {
2109                 if (drhd->ignored)
2110                         continue;
2111                 iommu = drhd->iommu;
2112                 sprintf(iommu->name, "dmar%d", unit++);
2113
2114                 iommu_flush_write_buffer(iommu);
2115
2116                 ret = dmar_set_interrupt(iommu);
2117                 if (ret)
2118                         goto error;
2119
2120                 iommu_set_root_entry(iommu);
2121
2122                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2123                                            0);
2124                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2125                                          0);
2126                 iommu_disable_protect_mem_regions(iommu);
2127
2128                 ret = iommu_enable_translation(iommu);
2129                 if (ret)
2130                         goto error;
2131         }
2132
2133         return 0;
2134 error:
2135         for_each_drhd_unit(drhd) {
2136                 if (drhd->ignored)
2137                         continue;
2138                 iommu = drhd->iommu;
2139                 free_iommu(iommu);
2140         }
2141         kfree(g_iommus);
2142         return ret;
2143 }
2144
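/* Size of the page-aligned region needed to cover host_addr .. host_addr + size. */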
2145 static inline u64 aligned_size(u64 host_addr, size_t size)
2146 {
2147         u64 addr;
2148         addr = (host_addr & (~PAGE_MASK)) + size;
2149         return PAGE_ALIGN(addr);
2150 }
2151
2152 struct iova *
2153 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2154 {
2155         struct iova *piova;
2156
2157         /* Make sure it's in range */
2158         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2159         if (!size || (IOVA_START_ADDR + size > end))
2160                 return NULL;
2161
2162         piova = alloc_iova(&domain->iovad,
2163                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2164         return piova;
2165 }
2166
2167 static struct iova *
2168 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2169                    size_t size, u64 dma_mask)
2170 {
2171         struct pci_dev *pdev = to_pci_dev(dev);
2172         struct iova *iova = NULL;
2173
2174         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2175                 iova = iommu_alloc_iova(domain, size, dma_mask);
2176         else {
2177                 /*
2178                  * First try to allocate an IO virtual address within
2179                  * DMA_32BIT_MASK; if that fails, try allocating from
2180                  * the higher range.
2181                  */
2182                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2183                 if (!iova)
2184                         iova = iommu_alloc_iova(domain, size, dma_mask);
2185         }
2186
2187         if (!iova) {
2188                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2189                 return NULL;
2190         }
2191
2192         return iova;
2193 }
2194
2195 static struct dmar_domain *
2196 get_valid_domain_for_dev(struct pci_dev *pdev)
2197 {
2198         struct dmar_domain *domain;
2199         int ret;
2200
2201         domain = get_domain_for_dev(pdev,
2202                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2203         if (!domain) {
2204                 printk(KERN_ERR
2205                         "Allocating domain for %s failed\n", pci_name(pdev));
2206                 return NULL;
2207         }
2208
2209         /* make sure context mapping is ok */
2210         if (unlikely(!domain_context_mapped(pdev))) {
2211                 ret = domain_context_mapping(domain, pdev);
2212                 if (ret) {
2213                         printk(KERN_ERR
2214                                 "Domain context map for %s failed\n",
2215                                 pci_name(pdev));
2216                         return NULL;
2217                 }
2218         }
2219
2220         return domain;
2221 }
2222
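/*
 * Core of the DMA-API map path: find (or create) the device's domain,
 * allocate an IOVA range below its DMA mask, install page table entries
 * with permissions derived from the DMA direction, and flush the IOTLB
 * (page-selective) since this is a non-present to present change.
 */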
2223 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2224                                      size_t size, int dir, u64 dma_mask)
2225 {
2226         struct pci_dev *pdev = to_pci_dev(hwdev);
2227         struct dmar_domain *domain;
2228         phys_addr_t start_paddr;
2229         struct iova *iova;
2230         int prot = 0;
2231         int ret;
2232         struct intel_iommu *iommu;
2233
2234         BUG_ON(dir == DMA_NONE);
2235         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2236                 return paddr;
2237
2238         domain = get_valid_domain_for_dev(pdev);
2239         if (!domain)
2240                 return 0;
2241
2242         iommu = domain_get_iommu(domain);
2243         size = aligned_size((u64)paddr, size);
2244
2245         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2246         if (!iova)
2247                 goto error;
2248
2249         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2250
2251         /*
2252          * Check if the DMAR supports zero-length reads on write-only
2253          * mappings.
2254          */
2255         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2256                         !cap_zlr(iommu->cap))
2257                 prot |= DMA_PTE_READ;
2258         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2259                 prot |= DMA_PTE_WRITE;
2260         /*
2261          * paddr .. paddr + size might cover only part of a page, but we
2262          * should map the whole page.  Note: if two parts of one page are
2263          * mapped separately, we may end up with two guest addresses mapping
2264          * to the same host paddr, but that is not a big problem.
2265          */
2266         ret = domain_page_mapping(domain, start_paddr,
2267                 ((u64)paddr) & PAGE_MASK, size, prot);
2268         if (ret)
2269                 goto error;
2270
2271         /* it's a non-present to present mapping */
2272         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2273                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2274         if (ret)
2275                 iommu_flush_write_buffer(iommu);
2276
2277         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2278
2279 error:
2280         if (iova)
2281                 __free_iova(&domain->iovad, iova);
2282         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2283                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2284         return 0;
2285 }
2286
2287 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2288                                  unsigned long offset, size_t size,
2289                                  enum dma_data_direction dir,
2290                                  struct dma_attrs *attrs)
2291 {
2292         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2293                                   dir, to_pci_dev(dev)->dma_mask);
2294 }
2295
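/*
 * Deferred unmapping: intel_unmap_page() normally queues the IOVA via
 * add_unmap() instead of flushing the IOTLB synchronously.  flush_unmaps()
 * then performs one global IOTLB flush per IOMMU and frees all queued
 * IOVAs, triggered either by the 10ms unmap_timer or when the queue hits
 * HIGH_WATER_MARK.
 */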
2296 static void flush_unmaps(void)
2297 {
2298         int i, j;
2299
2300         timer_on = 0;
2301
2302         /* just flush them all */
2303         for (i = 0; i < g_num_of_iommus; i++) {
2304                 struct intel_iommu *iommu = g_iommus[i];
2305                 if (!iommu)
2306                         continue;
2307
2308                 if (deferred_flush[i].next) {
2309                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2310                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2311                         for (j = 0; j < deferred_flush[i].next; j++) {
2312                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2313                                                 deferred_flush[i].iova[j]);
2314                         }
2315                         deferred_flush[i].next = 0;
2316                 }
2317         }
2318
2319         list_size = 0;
2320 }
2321
2322 static void flush_unmaps_timeout(unsigned long data)
2323 {
2324         unsigned long flags;
2325
2326         spin_lock_irqsave(&async_umap_flush_lock, flags);
2327         flush_unmaps();
2328         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2329 }
2330
2331 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2332 {
2333         unsigned long flags;
2334         int next, iommu_id;
2335         struct intel_iommu *iommu;
2336
2337         spin_lock_irqsave(&async_umap_flush_lock, flags);
2338         if (list_size == HIGH_WATER_MARK)
2339                 flush_unmaps();
2340
2341         iommu = domain_get_iommu(dom);
2342         iommu_id = iommu->seq_id;
2343
2344         next = deferred_flush[iommu_id].next;
2345         deferred_flush[iommu_id].domain[next] = dom;
2346         deferred_flush[iommu_id].iova[next] = iova;
2347         deferred_flush[iommu_id].next++;
2348
2349         if (!timer_on) {
2350                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2351                 timer_on = 1;
2352         }
2353         list_size++;
2354         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2355 }
2356
2357 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2358                              size_t size, enum dma_data_direction dir,
2359                              struct dma_attrs *attrs)
2360 {
2361         struct pci_dev *pdev = to_pci_dev(dev);
2362         struct dmar_domain *domain;
2363         unsigned long start_addr;
2364         struct iova *iova;
2365         struct intel_iommu *iommu;
2366
2367         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2368                 return;
2369         domain = find_domain(pdev);
2370         BUG_ON(!domain);
2371
2372         iommu = domain_get_iommu(domain);
2373
2374         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2375         if (!iova)
2376                 return;
2377
2378         start_addr = iova->pfn_lo << PAGE_SHIFT;
2379         size = aligned_size((u64)dev_addr, size);
2380
2381         pr_debug("Device %s unmapping: %lx@%llx\n",
2382                 pci_name(pdev), size, (unsigned long long)start_addr);
2383
2384         /* clear the PTEs covering the whole range */
2385         dma_pte_clear_range(domain, start_addr, start_addr + size);
2386         /* free page tables */
2387         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2388         if (intel_iommu_strict) {
2389                 if (iommu_flush_iotlb_psi(iommu,
2390                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2391                         iommu_flush_write_buffer(iommu);
2392                 /* free iova */
2393                 __free_iova(&domain->iovad, iova);
2394         } else {
2395                 add_unmap(domain, iova);
2396                 /*
2397                  * Queue up the release of the unmap to save the roughly 1/6th
2398                  * of the CPU time otherwise spent on the iotlb flush operation.
2399                  */
2400         }
2401 }
2402
2403 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2404                                int dir)
2405 {
2406         intel_unmap_page(dev, dev_addr, size, dir, NULL);
2407 }
2408
2409 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2410                                   dma_addr_t *dma_handle, gfp_t flags)
2411 {
2412         void *vaddr;
2413         int order;
2414
2415         size = PAGE_ALIGN(size);
2416         order = get_order(size);
2417         flags &= ~(GFP_DMA | GFP_DMA32);
2418
2419         vaddr = (void *)__get_free_pages(flags, order);
2420         if (!vaddr)
2421                 return NULL;
2422         memset(vaddr, 0, size);
2423
2424         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2425                                          DMA_BIDIRECTIONAL,
2426                                          hwdev->coherent_dma_mask);
2427         if (*dma_handle)
2428                 return vaddr;
2429         free_pages((unsigned long)vaddr, order);
2430         return NULL;
2431 }
2432
2433 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2434                                 dma_addr_t dma_handle)
2435 {
2436         int order;
2437
2438         size = PAGE_ALIGN(size);
2439         order = get_order(size);
2440
2441         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2442         free_pages((unsigned long)vaddr, order);
2443 }
2444
2445 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2446
2447 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2448                            int nelems, enum dma_data_direction dir,
2449                            struct dma_attrs *attrs)
2450 {
2451         int i;
2452         struct pci_dev *pdev = to_pci_dev(hwdev);
2453         struct dmar_domain *domain;
2454         unsigned long start_addr;
2455         struct iova *iova;
2456         size_t size = 0;
2457         void *addr;
2458         struct scatterlist *sg;
2459         struct intel_iommu *iommu;
2460
2461         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2462                 return;
2463
2464         domain = find_domain(pdev);
2465         BUG_ON(!domain);
2466
2467         iommu = domain_get_iommu(domain);
2468
2469         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2470         if (!iova)
2471                 return;
2472         for_each_sg(sglist, sg, nelems, i) {
2473                 addr = SG_ENT_VIRT_ADDRESS(sg);
2474                 size += aligned_size((u64)addr, sg->length);
2475         }
2476
2477         start_addr = iova->pfn_lo << PAGE_SHIFT;
2478
2479         /* clear the PTEs covering the whole range */
2480         dma_pte_clear_range(domain, start_addr, start_addr + size);
2481         /* free page tables */
2482         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2483
2484         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2485                         size >> VTD_PAGE_SHIFT, 0))
2486                 iommu_flush_write_buffer(iommu);
2487
2488         /* free iova */
2489         __free_iova(&domain->iovad, iova);
2490 }
2491
2492 static int intel_nontranslate_map_sg(struct device *hddev,
2493         struct scatterlist *sglist, int nelems, int dir)
2494 {
2495         int i;
2496         struct scatterlist *sg;
2497
2498         for_each_sg(sglist, sg, nelems, i) {
2499                 BUG_ON(!sg_page(sg));
2500                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2501                 sg->dma_length = sg->length;
2502         }
2503         return nelems;
2504 }
2505
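/*
 * Map a scatterlist: allocate one IOVA range large enough for the
 * page-aligned sizes of all segments, then map each segment into it
 * consecutively.  On failure everything mapped so far is torn down and
 * zero is returned.
 */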
2506 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2507                         enum dma_data_direction dir, struct dma_attrs *attrs)
2508 {
2509         void *addr;
2510         int i;
2511         struct pci_dev *pdev = to_pci_dev(hwdev);
2512         struct dmar_domain *domain;
2513         size_t size = 0;
2514         int prot = 0;
2515         size_t offset = 0;
2516         struct iova *iova = NULL;
2517         int ret;
2518         struct scatterlist *sg;
2519         unsigned long start_addr;
2520         struct intel_iommu *iommu;
2521
2522         BUG_ON(dir == DMA_NONE);
2523         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2524                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2525
2526         domain = get_valid_domain_for_dev(pdev);
2527         if (!domain)
2528                 return 0;
2529
2530         iommu = domain_get_iommu(domain);
2531
2532         for_each_sg(sglist, sg, nelems, i) {
2533                 addr = SG_ENT_VIRT_ADDRESS(sg);
2534                 addr = (void *)virt_to_phys(addr);
2535                 size += aligned_size((u64)addr, sg->length);
2536         }
2537
2538         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2539         if (!iova) {
2540                 sglist->dma_length = 0;
2541                 return 0;
2542         }
2543
2544         /*
2545          * Check if the DMAR supports zero-length reads on write-only
2546          * mappings.
2547          */
2548         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2549                         !cap_zlr(iommu->cap))
2550                 prot |= DMA_PTE_READ;
2551         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2552                 prot |= DMA_PTE_WRITE;
2553
2554         start_addr = iova->pfn_lo << PAGE_SHIFT;
2555         offset = 0;
2556         for_each_sg(sglist, sg, nelems, i) {
2557                 addr = SG_ENT_VIRT_ADDRESS(sg);
2558                 addr = (void *)virt_to_phys(addr);
2559                 size = aligned_size((u64)addr, sg->length);
2560                 ret = domain_page_mapping(domain, start_addr + offset,
2561                         ((u64)addr) & PAGE_MASK,
2562                         size, prot);
2563                 if (ret) {
2564                         /* clear whatever was mapped so far */
2565                         dma_pte_clear_range(domain, start_addr,
2566                                   start_addr + offset);
2567                         /* free page tables */
2568                         dma_pte_free_pagetable(domain, start_addr,
2569                                   start_addr + offset);
2570                         /* free iova */
2571                         __free_iova(&domain->iovad, iova);
2572                         return 0;
2573                 }
2574                 sg->dma_address = start_addr + offset +
2575                                 ((u64)addr & (~PAGE_MASK));
2576                 sg->dma_length = sg->length;
2577                 offset += size;
2578         }
2579
2580         /* it's a non-present to present mapping */
2581         if (iommu_flush_iotlb_psi(iommu, domain->id,
2582                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2583                 iommu_flush_write_buffer(iommu);
2584         return nelems;
2585 }
2586
2587 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2588 {
2589         return !dma_addr;
2590 }
2591
2592 struct dma_map_ops intel_dma_ops = {
2593         .alloc_coherent = intel_alloc_coherent,
2594         .free_coherent = intel_free_coherent,
2595         .map_sg = intel_map_sg,
2596         .unmap_sg = intel_unmap_sg,
2597         .map_page = intel_map_page,
2598         .unmap_page = intel_unmap_page,
2599         .mapping_error = intel_mapping_error,
2600 };
2601
2602 static inline int iommu_domain_cache_init(void)
2603 {
2604         int ret = 0;
2605
2606         iommu_domain_cache = kmem_cache_create("iommu_domain",
2607                                          sizeof(struct dmar_domain),
2608                                          0,
2609                                          SLAB_HWCACHE_ALIGN,
2610                                          NULL);
2612         if (!iommu_domain_cache) {
2613                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2614                 ret = -ENOMEM;
2615         }
2616
2617         return ret;
2618 }
2619
2620 static inline int iommu_devinfo_cache_init(void)
2621 {
2622         int ret = 0;
2623
2624         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2625                                          sizeof(struct device_domain_info),
2626                                          0,
2627                                          SLAB_HWCACHE_ALIGN,
2628                                          NULL);
2629         if (!iommu_devinfo_cache) {
2630                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2631                 ret = -ENOMEM;
2632         }
2633
2634         return ret;
2635 }
2636
2637 static inline int iommu_iova_cache_init(void)
2638 {
2639         int ret = 0;
2640
2641         iommu_iova_cache = kmem_cache_create("iommu_iova",
2642                                          sizeof(struct iova),
2643                                          0,
2644                                          SLAB_HWCACHE_ALIGN,
2645                                          NULL);
2646         if (!iommu_iova_cache) {
2647                 printk(KERN_ERR "Couldn't create iova cache\n");
2648                 ret = -ENOMEM;
2649         }
2650
2651         return ret;
2652 }
2653
2654 static int __init iommu_init_mempool(void)
2655 {
2656         int ret;
2657         ret = iommu_iova_cache_init();
2658         if (ret)
2659                 return ret;
2660
2661         ret = iommu_domain_cache_init();
2662         if (ret)
2663                 goto domain_error;
2664
2665         ret = iommu_devinfo_cache_init();
2666         if (!ret)
2667                 return ret;
2668
2669         kmem_cache_destroy(iommu_domain_cache);
2670 domain_error:
2671         kmem_cache_destroy(iommu_iova_cache);
2672
2673         return -ENOMEM;
2674 }
2675
2676 static void __init iommu_exit_mempool(void)
2677 {
2678         kmem_cache_destroy(iommu_devinfo_cache);
2679         kmem_cache_destroy(iommu_domain_cache);
2680         kmem_cache_destroy(iommu_iova_cache);
2681
2682 }
2683
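/*
 * Mark DRHD units that cover no PCI devices as ignored, and (unless
 * dmar_map_gfx is set) bypass units that serve only graphics devices by
 * tagging their devices with DUMMY_DEVICE_DOMAIN_INFO.
 */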
2684 static void __init init_no_remapping_devices(void)
2685 {
2686         struct dmar_drhd_unit *drhd;
2687
2688         for_each_drhd_unit(drhd) {
2689                 if (!drhd->include_all) {
2690                         int i;
2691                         for (i = 0; i < drhd->devices_cnt; i++)
2692                                 if (drhd->devices[i] != NULL)
2693                                         break;
2694                         /* ignore this DMAR unit if no PCI devices exist under it */
2695                         if (i == drhd->devices_cnt)
2696                                 drhd->ignored = 1;
2697                 }
2698         }
2699
2700         if (dmar_map_gfx)
2701                 return;
2702
2703         for_each_drhd_unit(drhd) {
2704                 int i;
2705                 if (drhd->ignored || drhd->include_all)
2706                         continue;
2707
2708                 for (i = 0; i < drhd->devices_cnt; i++)
2709                         if (drhd->devices[i] &&
2710                                 !IS_GFX_DEVICE(drhd->devices[i]))
2711                                 break;
2712
2713                 if (i < drhd->devices_cnt)
2714                         continue;
2715
2716                 /* bypass IOMMU if it is just for gfx devices */
2717                 drhd->ignored = 1;
2718                 for (i = 0; i < drhd->devices_cnt; i++) {
2719                         if (!drhd->devices[i])
2720                                 continue;
2721                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2722                 }
2723         }
2724 }
2725
2726 int __init intel_iommu_init(void)
2727 {
2728         int ret = 0;
2729
2730         if (dmar_table_init())
2731                 return  -ENODEV;
2732
2733         if (dmar_dev_scope_init())
2734                 return  -ENODEV;
2735
2736         /*
2737          * Check the need for DMA-remapping initialization now.
2738          * The initialization above will also be used by interrupt remapping.
2739          */
2740         if (no_iommu || swiotlb || dmar_disabled)
2741                 return -ENODEV;
2742
2743         iommu_init_mempool();
2744         dmar_init_reserved_ranges();
2745
2746         init_no_remapping_devices();
2747
2748         ret = init_dmars();
2749         if (ret) {
2750                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2751                 put_iova_domain(&reserved_iova_list);
2752                 iommu_exit_mempool();
2753                 return ret;
2754         }
2755         printk(KERN_INFO
2756         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2757
2758         init_timer(&unmap_timer);
2759         force_iommu = 1;
2760         dma_ops = &intel_dma_ops;
2761
2762         register_iommu(&intel_iommu_ops);
2763
2764         return 0;
2765 }
2766
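/*
 * The functions below manage "virtual machine" domains created through the
 * generic IOMMU API (see intel_iommu_ops at the end of this file), e.g. for
 * device assignment: they track the devices attached to a domain and keep
 * iommu_bmp, iommu_count and the coherency flag in sync.
 */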
2767 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2768                                   struct pci_dev *pdev)
2769 {
2770         struct device_domain_info *info;
2771         unsigned long flags;
2772
2773         info = alloc_devinfo_mem();
2774         if (!info)
2775                 return -ENOMEM;
2776
2777         info->bus = pdev->bus->number;
2778         info->devfn = pdev->devfn;
2779         info->dev = pdev;
2780         info->domain = domain;
2781
2782         spin_lock_irqsave(&device_domain_lock, flags);
2783         list_add(&info->link, &domain->devices);
2784         list_add(&info->global, &device_domain_list);
2785         pdev->dev.archdata.iommu = info;
2786         spin_unlock_irqrestore(&device_domain_lock, flags);
2787
2788         return 0;
2789 }
2790
2791 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2792                                           struct pci_dev *pdev)
2793 {
2794         struct device_domain_info *info;
2795         struct intel_iommu *iommu;
2796         unsigned long flags;
2797         int found = 0;
2798         struct list_head *entry, *tmp;
2799
2800         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2801         if (!iommu)
2802                 return;
2803
2804         spin_lock_irqsave(&device_domain_lock, flags);
2805         list_for_each_safe(entry, tmp, &domain->devices) {
2806                 info = list_entry(entry, struct device_domain_info, link);
2807                 if (info->bus == pdev->bus->number &&
2808                     info->devfn == pdev->devfn) {
2809                         list_del(&info->link);
2810                         list_del(&info->global);
2811                         if (info->dev)
2812                                 info->dev->dev.archdata.iommu = NULL;
2813                         spin_unlock_irqrestore(&device_domain_lock, flags);
2814
2815                         iommu_detach_dev(iommu, info->bus, info->devfn);
2816                         free_devinfo_mem(info);
2817
2818                         spin_lock_irqsave(&device_domain_lock, flags);
2819
2820                         if (found)
2821                                 break;
2822                         else
2823                                 continue;
2824                 }
2825
2826                 /* if there are no other devices under the same iommu
2827                  * owned by this domain, clear this iommu in iommu_bmp and
2828                  * update the iommu count and coherency
2829                  */
2830                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2831                         found = 1;
2832         }
2833
2834         if (found == 0) {
2835                 unsigned long tmp_flags;
2836                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2837                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2838                 domain->iommu_count--;
2839                 domain_update_iommu_coherency(domain);
2840                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2841         }
2842
2843         spin_unlock_irqrestore(&device_domain_lock, flags);
2844 }
2845
2846 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2847 {
2848         struct device_domain_info *info;
2849         struct intel_iommu *iommu;
2850         unsigned long flags1, flags2;
2851
2852         spin_lock_irqsave(&device_domain_lock, flags1);
2853         while (!list_empty(&domain->devices)) {
2854                 info = list_entry(domain->devices.next,
2855                         struct device_domain_info, link);
2856                 list_del(&info->link);
2857                 list_del(&info->global);
2858                 if (info->dev)
2859                         info->dev->dev.archdata.iommu = NULL;
2860
2861                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2862
2863                 iommu = device_to_iommu(info->bus, info->devfn);
2864                 iommu_detach_dev(iommu, info->bus, info->devfn);
2865
2866                 /* clear this iommu in iommu_bmp, update iommu count
2867                  * and coherency
2868                  */
2869                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2870                 if (test_and_clear_bit(iommu->seq_id,
2871                                        &domain->iommu_bmp)) {
2872                         domain->iommu_count--;
2873                         domain_update_iommu_coherency(domain);
2874                 }
2875                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2876
2877                 free_devinfo_mem(info);
2878                 spin_lock_irqsave(&device_domain_lock, flags1);
2879         }
2880         spin_unlock_irqrestore(&device_domain_lock, flags1);
2881 }
2882
2883 /* domain id for a virtual machine; it is never written into a context entry */
2884 static unsigned long vm_domid;
2885
2886 static int vm_domain_min_agaw(struct dmar_domain *domain)
2887 {
2888         int i;
2889         int min_agaw = domain->agaw;
2890
2891         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2892         for (; i < g_num_of_iommus; ) {
2893                 if (min_agaw > g_iommus[i]->agaw)
2894                         min_agaw = g_iommus[i]->agaw;
2895
2896                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2897         }
2898
2899         return min_agaw;
2900 }
2901
2902 static struct dmar_domain *iommu_alloc_vm_domain(void)
2903 {
2904         struct dmar_domain *domain;
2905
2906         domain = alloc_domain_mem();
2907         if (!domain)
2908                 return NULL;
2909
2910         domain->id = vm_domid++;
2911         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2912         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2913
2914         return domain;
2915 }
2916
2917 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2918 {
2919         int adjust_width;
2920
2921         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2922         spin_lock_init(&domain->mapping_lock);
2923         spin_lock_init(&domain->iommu_lock);
2924
2925         domain_reserve_special_ranges(domain);
2926
2927         /* calculate AGAW */
2928         domain->gaw = guest_width;
2929         adjust_width = guestwidth_to_adjustwidth(guest_width);
2930         domain->agaw = width_to_agaw(adjust_width);
2931
2932         INIT_LIST_HEAD(&domain->devices);
2933
2934         domain->iommu_count = 0;
2935         domain->iommu_coherency = 0;
2936         domain->max_addr = 0;
2937
2938         /* always allocate the top pgd */
2939         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2940         if (!domain->pgd)
2941                 return -ENOMEM;
2942         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2943         return 0;
2944 }
2945
2946 static void iommu_free_vm_domain(struct dmar_domain *domain)
2947 {
2948         unsigned long flags;
2949         struct dmar_drhd_unit *drhd;
2950         struct intel_iommu *iommu;
2951         unsigned long i;
2952         unsigned long ndomains;
2953
2954         for_each_drhd_unit(drhd) {
2955                 if (drhd->ignored)
2956                         continue;
2957                 iommu = drhd->iommu;
2958
2959                 ndomains = cap_ndoms(iommu->cap);
2960                 i = find_first_bit(iommu->domain_ids, ndomains);
2961                 for (; i < ndomains; ) {
2962                         if (iommu->domains[i] == domain) {
2963                                 spin_lock_irqsave(&iommu->lock, flags);
2964                                 clear_bit(i, iommu->domain_ids);
2965                                 iommu->domains[i] = NULL;
2966                                 spin_unlock_irqrestore(&iommu->lock, flags);
2967                                 break;
2968                         }
2969                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2970                 }
2971         }
2972 }
2973
2974 static void vm_domain_exit(struct dmar_domain *domain)
2975 {
2976         u64 end;
2977
2978         /* Domain 0 is reserved, so don't process it */
2979         if (!domain)
2980                 return;
2981
2982         vm_domain_remove_all_dev_info(domain);
2983         /* destroy iovas */
2984         put_iova_domain(&domain->iovad);
2985         end = DOMAIN_MAX_ADDR(domain->gaw);
2986         end = end & (~VTD_PAGE_MASK);
2987
2988         /* clear ptes */
2989         dma_pte_clear_range(domain, 0, end);
2990
2991         /* free page tables */
2992         dma_pte_free_pagetable(domain, 0, end);
2993
2994         iommu_free_vm_domain(domain);
2995         free_domain_mem(domain);
2996 }
2997
2998 static int intel_iommu_domain_init(struct iommu_domain *domain)
2999 {
3000         struct dmar_domain *dmar_domain;
3001
3002         dmar_domain = iommu_alloc_vm_domain();
3003         if (!dmar_domain) {
3004                 printk(KERN_ERR
3005                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3006                 return -ENOMEM;
3007         }
3008         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3009                 printk(KERN_ERR
3010                         "intel_iommu_domain_init() failed\n");
3011                 vm_domain_exit(dmar_domain);
3012                 return -ENOMEM;
3013         }
3014         domain->priv = dmar_domain;
3015
3016         return 0;
3017 }
3018
3019 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3020 {
3021         struct dmar_domain *dmar_domain = domain->priv;
3022
3023         domain->priv = NULL;
3024         vm_domain_exit(dmar_domain);
3025 }
3026
3027 static int intel_iommu_attach_device(struct iommu_domain *domain,
3028                                      struct device *dev)
3029 {
3030         struct dmar_domain *dmar_domain = domain->priv;
3031         struct pci_dev *pdev = to_pci_dev(dev);
3032         struct intel_iommu *iommu;
3033         int addr_width;
3034         u64 end;
3035         int ret;
3036
3037         /* normally pdev is not mapped */
3038         if (unlikely(domain_context_mapped(pdev))) {
3039                 struct dmar_domain *old_domain;
3040
3041                 old_domain = find_domain(pdev);
3042                 if (old_domain) {
3043                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3044                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3045                         else
3046                                 domain_remove_dev_info(old_domain);
3047                 }
3048         }
3049
3050         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3051         if (!iommu)
3052                 return -ENODEV;
3053
3054         /* check if this iommu agaw is sufficient for max mapped address */
3055         addr_width = agaw_to_width(iommu->agaw);
3056         end = DOMAIN_MAX_ADDR(addr_width);
3057         end = end & VTD_PAGE_MASK;
3058         if (end < dmar_domain->max_addr) {
3059                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3060                        "sufficient for the mapped address (%llx)\n",
3061                        __func__, iommu->agaw, dmar_domain->max_addr);
3062                 return -EFAULT;
3063         }
3064
3065         ret = domain_context_mapping(dmar_domain, pdev);
3066         if (ret)
3067                 return ret;
3068
3069         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3070         return ret;
3071 }
3072
3073 static void intel_iommu_detach_device(struct iommu_domain *domain,
3074                                       struct device *dev)
3075 {
3076         struct dmar_domain *dmar_domain = domain->priv;
3077         struct pci_dev *pdev = to_pci_dev(dev);
3078
3079         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3080 }
3081
3082 static int intel_iommu_map_range(struct iommu_domain *domain,
3083                                  unsigned long iova, phys_addr_t hpa,
3084                                  size_t size, int iommu_prot)
3085 {
3086         struct dmar_domain *dmar_domain = domain->priv;
3087         u64 max_addr;
3088         int addr_width;
3089         int prot = 0;
3090         int ret;
3091
3092         if (iommu_prot & IOMMU_READ)
3093                 prot |= DMA_PTE_READ;
3094         if (iommu_prot & IOMMU_WRITE)
3095                 prot |= DMA_PTE_WRITE;
3096
3097         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3098         if (dmar_domain->max_addr < max_addr) {
3099                 int min_agaw;
3100                 u64 end;
3101
3102                 /* check if minimum agaw is sufficient for mapped address */
3103                 min_agaw = vm_domain_min_agaw(dmar_domain);
3104                 addr_width = agaw_to_width(min_agaw);
3105                 end = DOMAIN_MAX_ADDR(addr_width);
3106                 end = end & VTD_PAGE_MASK;
3107                 if (end < max_addr) {
3108                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3109                                "sufficient for the mapped address (%llx)\n",
3110                                __func__, min_agaw, max_addr);
3111                         return -EFAULT;
3112                 }
3113                 dmar_domain->max_addr = max_addr;
3114         }
3115
3116         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3117         return ret;
3118 }
3119
3120 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3121                                     unsigned long iova, size_t size)
3122 {
3123         struct dmar_domain *dmar_domain = domain->priv;
3124         dma_addr_t base;
3125
3126         /* The address might not be aligned */
3127         base = iova & VTD_PAGE_MASK;
3128         size = VTD_PAGE_ALIGN(size);
3129         dma_pte_clear_range(dmar_domain, base, base + size);
3130
3131         if (dmar_domain->max_addr == base + size)
3132                 dmar_domain->max_addr = base;
3133 }
3134
3135 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3136                                             unsigned long iova)
3137 {
3138         struct dmar_domain *dmar_domain = domain->priv;
3139         struct dma_pte *pte;
3140         u64 phys = 0;
3141
3142         pte = addr_to_dma_pte(dmar_domain, iova);
3143         if (pte)
3144                 phys = dma_pte_addr(pte);
3145
3146         return phys;
3147 }
3148
3149 static struct iommu_ops intel_iommu_ops = {
3150         .domain_init    = intel_iommu_domain_init,
3151         .domain_destroy = intel_iommu_domain_destroy,
3152         .attach_dev     = intel_iommu_attach_device,
3153         .detach_dev     = intel_iommu_detach_device,
3154         .map            = intel_iommu_map_range,
3155         .unmap          = intel_iommu_unmap_range,
3156         .iova_to_phys   = intel_iommu_iova_to_phys,
3157 };
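
/*
 * A minimal usage sketch (assuming the generic IOMMU API helpers declared
 * in linux/iommu.h): a consumer such as KVM device assignment is expected
 * to reach the callbacks above only through those helpers, roughly:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map_range(dom, iova, hpa, size, IOMMU_READ | IOMMU_WRITE);
 *	...
 *	iommu_unmap_range(dom, iova, size);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */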
3158
3159 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3160 {
3161         /*
3162          * Mobile 4 Series Chipset neglects to set RWBF capability,
3163          * but needs it:
3164          */
3165         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3166         rwbf_quirk = 1;
3167 }
3168
3169 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);