[linux-2.6-omap-h63xx.git] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
59
60 /*
61  * 0: Present
62  * 1-11: Reserved
63  * 12-63: Context Ptr (12 - (haw-1))
64  * 64-127: Reserved
65  */
66 struct root_entry {
67         u64     val;
68         u64     rsvd1;
69 };
70 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
71 static inline bool root_present(struct root_entry *root)
72 {
73         return (root->val & 1);
74 }
75 static inline void set_root_present(struct root_entry *root)
76 {
77         root->val |= 1;
78 }
79 static inline void set_root_value(struct root_entry *root, unsigned long value)
80 {
81         root->val |= value & VTD_PAGE_MASK;
82 }
83
84 static inline struct context_entry *
85 get_context_addr_from_root(struct root_entry *root)
86 {
87         return (struct context_entry *)
88                 (root_present(root)?phys_to_virt(
89                 root->val & VTD_PAGE_MASK) :
90                 NULL);
91 }
92
93 /*
94  * low 64 bits:
95  * 0: present
96  * 1: fault processing disable
97  * 2-3: translation type
98  * 12-63: address space root
99  * high 64 bits:
100  * 0-2: address width
101  * 3-6: available
102  * 8-23: domain id
103  */
104 struct context_entry {
105         u64 lo;
106         u64 hi;
107 };
108
109 static inline bool context_present(struct context_entry *context)
110 {
111         return (context->lo & 1);
112 }
113 static inline void context_set_present(struct context_entry *context)
114 {
115         context->lo |= 1;
116 }
117
118 static inline void context_set_fault_enable(struct context_entry *context)
119 {
120         context->lo &= (((u64)-1) << 2) | 1;
121 }
122
123 #define CONTEXT_TT_MULTI_LEVEL 0
124
125 static inline void context_set_translation_type(struct context_entry *context,
126                                                 unsigned long value)
127 {
128         context->lo &= (((u64)-1) << 4) | 3;
129         context->lo |= (value & 3) << 2;
130 }
131
132 static inline void context_set_address_root(struct context_entry *context,
133                                             unsigned long value)
134 {
135         context->lo |= value & VTD_PAGE_MASK;
136 }
137
138 static inline void context_set_address_width(struct context_entry *context,
139                                              unsigned long value)
140 {
141         context->hi |= value & 7;
142 }
143
144 static inline void context_set_domain_id(struct context_entry *context,
145                                          unsigned long value)
146 {
147         context->hi |= (value & ((1 << 16) - 1)) << 8;
148 }
149
150 static inline void context_clear_entry(struct context_entry *context)
151 {
152         context->lo = 0;
153         context->hi = 0;
154 }
155
156 /*
157  * 0: readable
158  * 1: writable
159  * 2-6: reserved
160  * 7: super page
161  * 8-11: available
162  * 12-63: Host physical address
163  */
164 struct dma_pte {
165         u64 val;
166 };
167
168 static inline void dma_clear_pte(struct dma_pte *pte)
169 {
170         pte->val = 0;
171 }
172
173 static inline void dma_set_pte_readable(struct dma_pte *pte)
174 {
175         pte->val |= DMA_PTE_READ;
176 }
177
178 static inline void dma_set_pte_writable(struct dma_pte *pte)
179 {
180         pte->val |= DMA_PTE_WRITE;
181 }
182
183 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
184 {
185         pte->val = (pte->val & ~3) | (prot & 3);
186 }
187
188 static inline u64 dma_pte_addr(struct dma_pte *pte)
189 {
190         return (pte->val & VTD_PAGE_MASK);
191 }
192
193 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
194 {
195         pte->val |= (addr & VTD_PAGE_MASK);
196 }
197
198 static inline bool dma_pte_present(struct dma_pte *pte)
199 {
200         return (pte->val & 3) != 0;
201 }
202
203 struct dmar_domain {
204         int     id;                     /* domain id */
205         struct intel_iommu *iommu;      /* back pointer to owning iommu */
206
207         struct list_head devices;       /* all devices' list */
208         struct iova_domain iovad;       /* iova's that belong to this domain */
209
210         struct dma_pte  *pgd;           /* virtual address */
211         spinlock_t      mapping_lock;   /* page table lock */
212         int             gaw;            /* max guest address width */
213
214         /* adjusted guest address width, 0 is level 2 30-bit */
215         int             agaw;
216
217 #define DOMAIN_FLAG_MULTIPLE_DEVICES 1
218         int             flags;
219 };
220
221 /* PCI domain-device relationship */
222 struct device_domain_info {
223         struct list_head link;  /* link to domain siblings */
224         struct list_head global; /* link to global list */
225         u8 bus;                 /* PCI bus number */
226         u8 devfn;               /* PCI devfn number */
227         struct pci_dev *dev; /* NULL for a PCIe-to-PCI bridge */
228         struct dmar_domain *domain; /* pointer to domain */
229 };
230
231 static void flush_unmaps_timeout(unsigned long data);
232
233 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
234
235 #define HIGH_WATER_MARK 250
236 struct deferred_flush_tables {
237         int next;
238         struct iova *iova[HIGH_WATER_MARK];
239         struct dmar_domain *domain[HIGH_WATER_MARK];
240 };
241
242 static struct deferred_flush_tables *deferred_flush;
243
244 /* number of intel_iommus in the system */
245 static int g_num_of_iommus;
246
247 static DEFINE_SPINLOCK(async_umap_flush_lock);
248 static LIST_HEAD(unmaps_to_do);
249
250 static int timer_on;
251 static long list_size;
252
253 static void domain_remove_dev_info(struct dmar_domain *domain);
254
255 int dmar_disabled;
256 static int __initdata dmar_map_gfx = 1;
257 static int dmar_forcedac;
258 static int intel_iommu_strict;
259
260 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
261 static DEFINE_SPINLOCK(device_domain_lock);
262 static LIST_HEAD(device_domain_list);
263
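/*
 * Parse the intel_iommu= boot option.  Comma-separated keywords:
 * "off" disables DMA remapping altogether, "igfx_off" disables mapping
 * of graphics devices, "forcedac" forces the use of 64-bit (DAC) DMA
 * addresses, and "strict" disables the batched IOTLB flush.
 */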
264 static int __init intel_iommu_setup(char *str)
265 {
266         if (!str)
267                 return -EINVAL;
268         while (*str) {
269                 if (!strncmp(str, "off", 3)) {
270                         dmar_disabled = 1;
271                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
272                 } else if (!strncmp(str, "igfx_off", 8)) {
273                         dmar_map_gfx = 0;
274                         printk(KERN_INFO
275                                 "Intel-IOMMU: disable GFX device mapping\n");
276                 } else if (!strncmp(str, "forcedac", 8)) {
277                         printk(KERN_INFO
278                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
279                         dmar_forcedac = 1;
280                 } else if (!strncmp(str, "strict", 6)) {
281                         printk(KERN_INFO
282                                 "Intel-IOMMU: disable batched IOTLB flush\n");
283                         intel_iommu_strict = 1;
284                 }
285
286                 str += strcspn(str, ",");
287                 while (*str == ',')
288                         str++;
289         }
290         return 0;
291 }
292 __setup("intel_iommu=", intel_iommu_setup);
293
294 static struct kmem_cache *iommu_domain_cache;
295 static struct kmem_cache *iommu_devinfo_cache;
296 static struct kmem_cache *iommu_iova_cache;
297
298 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
299 {
300         unsigned int flags;
301         void *vaddr;
302
303         /* trying to avoid low memory issues */
304         flags = current->flags & PF_MEMALLOC;
305         current->flags |= PF_MEMALLOC;
306         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
307         current->flags &= (~PF_MEMALLOC | flags);
308         return vaddr;
309 }
310
311
312 static inline void *alloc_pgtable_page(void)
313 {
314         unsigned int flags;
315         void *vaddr;
316
317         /* trying to avoid low memory issues */
318         flags = current->flags & PF_MEMALLOC;
319         current->flags |= PF_MEMALLOC;
320         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
321         current->flags &= (~PF_MEMALLOC | flags);
322         return vaddr;
323 }
324
325 static inline void free_pgtable_page(void *vaddr)
326 {
327         free_page((unsigned long)vaddr);
328 }
329
330 static inline void *alloc_domain_mem(void)
331 {
332         return iommu_kmem_cache_alloc(iommu_domain_cache);
333 }
334
335 static void free_domain_mem(void *vaddr)
336 {
337         kmem_cache_free(iommu_domain_cache, vaddr);
338 }
339
340 static inline void * alloc_devinfo_mem(void)
341 {
342         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
343 }
344
345 static inline void free_devinfo_mem(void *vaddr)
346 {
347         kmem_cache_free(iommu_devinfo_cache, vaddr);
348 }
349
350 struct iova *alloc_iova_mem(void)
351 {
352         return iommu_kmem_cache_alloc(iommu_iova_cache);
353 }
354
355 void free_iova_mem(struct iova *iova)
356 {
357         kmem_cache_free(iommu_iova_cache, iova);
358 }
359
360 /* Get the context entry for a given bus/devfn, allocating the context table if needed */
361 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
362                 u8 bus, u8 devfn)
363 {
364         struct root_entry *root;
365         struct context_entry *context;
366         unsigned long phy_addr;
367         unsigned long flags;
368
369         spin_lock_irqsave(&iommu->lock, flags);
370         root = &iommu->root_entry[bus];
371         context = get_context_addr_from_root(root);
372         if (!context) {
373                 context = (struct context_entry *)alloc_pgtable_page();
374                 if (!context) {
375                         spin_unlock_irqrestore(&iommu->lock, flags);
376                         return NULL;
377                 }
378                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
379                 phy_addr = virt_to_phys((void *)context);
380                 set_root_value(root, phy_addr);
381                 set_root_present(root);
382                 __iommu_flush_cache(iommu, root, sizeof(*root));
383         }
384         spin_unlock_irqrestore(&iommu->lock, flags);
385         return &context[devfn];
386 }
387
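/* Return non-zero if a present context entry exists for this bus/devfn */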
388 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
389 {
390         struct root_entry *root;
391         struct context_entry *context;
392         int ret;
393         unsigned long flags;
394
395         spin_lock_irqsave(&iommu->lock, flags);
396         root = &iommu->root_entry[bus];
397         context = get_context_addr_from_root(root);
398         if (!context) {
399                 ret = 0;
400                 goto out;
401         }
402         ret = context_present(&context[devfn]);
403 out:
404         spin_unlock_irqrestore(&iommu->lock, flags);
405         return ret;
406 }
407
408 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
409 {
410         struct root_entry *root;
411         struct context_entry *context;
412         unsigned long flags;
413
414         spin_lock_irqsave(&iommu->lock, flags);
415         root = &iommu->root_entry[bus];
416         context = get_context_addr_from_root(root);
417         if (context) {
418                 context_clear_entry(&context[devfn]);
419                 __iommu_flush_cache(iommu, &context[devfn],
420                         sizeof(*context));
421         }
422         spin_unlock_irqrestore(&iommu->lock, flags);
423 }
424
425 static void free_context_table(struct intel_iommu *iommu)
426 {
427         struct root_entry *root;
428         int i;
429         unsigned long flags;
430         struct context_entry *context;
431
432         spin_lock_irqsave(&iommu->lock, flags);
433         if (!iommu->root_entry) {
434                 goto out;
435         }
436         for (i = 0; i < ROOT_ENTRY_NR; i++) {
437                 root = &iommu->root_entry[i];
438                 context = get_context_addr_from_root(root);
439                 if (context)
440                         free_pgtable_page(context);
441         }
442         free_pgtable_page(iommu->root_entry);
443         iommu->root_entry = NULL;
444 out:
445         spin_unlock_irqrestore(&iommu->lock, flags);
446 }
447
448 /* page table handling */
449 #define LEVEL_STRIDE            (9)
450 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
451
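/*
 * Adjusted guest address width (agaw) 0 corresponds to a two-level page
 * table covering a 30-bit address space; each increment adds one page
 * table level and LEVEL_STRIDE (9) bits of address width.
 */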
452 static inline int agaw_to_level(int agaw)
453 {
454         return agaw + 2;
455 }
456
457 static inline int agaw_to_width(int agaw)
458 {
459         return 30 + agaw * LEVEL_STRIDE;
460
461 }
462
463 static inline int width_to_agaw(int width)
464 {
465         return (width - 30) / LEVEL_STRIDE;
466 }
467
468 static inline unsigned int level_to_offset_bits(int level)
469 {
470         return (12 + (level - 1) * LEVEL_STRIDE);
471 }
472
473 static inline int address_level_offset(u64 addr, int level)
474 {
475         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
476 }
477
478 static inline u64 level_mask(int level)
479 {
480         return ((u64)-1 << level_to_offset_bits(level));
481 }
482
483 static inline u64 level_size(int level)
484 {
485         return ((u64)1 << level_to_offset_bits(level));
486 }
487
488 static inline u64 align_to_level(u64 addr, int level)
489 {
490         return ((addr + level_size(level) - 1) & level_mask(level));
491 }
492
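/*
 * Walk the page table for a given IO virtual address, allocating any
 * missing intermediate levels, and return the last-level (4KiB) PTE,
 * or NULL if a page table page could not be allocated.
 */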
493 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
494 {
495         int addr_width = agaw_to_width(domain->agaw);
496         struct dma_pte *parent, *pte = NULL;
497         int level = agaw_to_level(domain->agaw);
498         int offset;
499         unsigned long flags;
500
501         BUG_ON(!domain->pgd);
502
503         addr &= (((u64)1) << addr_width) - 1;
504         parent = domain->pgd;
505
506         spin_lock_irqsave(&domain->mapping_lock, flags);
507         while (level > 0) {
508                 void *tmp_page;
509
510                 offset = address_level_offset(addr, level);
511                 pte = &parent[offset];
512                 if (level == 1)
513                         break;
514
515                 if (!dma_pte_present(pte)) {
516                         tmp_page = alloc_pgtable_page();
517
518                         if (!tmp_page) {
519                                 spin_unlock_irqrestore(&domain->mapping_lock,
520                                         flags);
521                                 return NULL;
522                         }
523                         __iommu_flush_cache(domain->iommu, tmp_page,
524                                         PAGE_SIZE);
525                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
526                         /*
527                          * higher-level tables always set r/w; the last-level page
528                          * table controls read/write
529                          */
530                         dma_set_pte_readable(pte);
531                         dma_set_pte_writable(pte);
532                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
533                 }
534                 parent = phys_to_virt(dma_pte_addr(pte));
535                 level--;
536         }
537
538         spin_unlock_irqrestore(&domain->mapping_lock, flags);
539         return pte;
540 }
541
542 /* return address's pte at specific level */
543 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
544                 int level)
545 {
546         struct dma_pte *parent, *pte = NULL;
547         int total = agaw_to_level(domain->agaw);
548         int offset;
549
550         parent = domain->pgd;
551         while (level <= total) {
552                 offset = address_level_offset(addr, total);
553                 pte = &parent[offset];
554                 if (level == total)
555                         return pte;
556
557                 if (!dma_pte_present(pte))
558                         break;
559                 parent = phys_to_virt(dma_pte_addr(pte));
560                 total--;
561         }
562         return NULL;
563 }
564
565 /* clear one page's page table */
566 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
567 {
568         struct dma_pte *pte = NULL;
569
570         /* get last level pte */
571         pte = dma_addr_level_pte(domain, addr, 1);
572
573         if (pte) {
574                 dma_clear_pte(pte);
575                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
576         }
577 }
578
579 /* clear the last level ptes; a TLB flush should follow */
580 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
581 {
582         int addr_width = agaw_to_width(domain->agaw);
583
584         start &= (((u64)1) << addr_width) - 1;
585         end &= (((u64)1) << addr_width) - 1;
586         /* in case it's a partial page */
587         start = PAGE_ALIGN(start);
588         end &= PAGE_MASK;
589
590         /* we don't need lock here, nobody else touches the iova range */
591         while (start < end) {
592                 dma_pte_clear_one(domain, start);
593                 start += VTD_PAGE_SIZE;
594         }
595 }
596
597 /* free page table pages. last level pte should already be cleared */
598 static void dma_pte_free_pagetable(struct dmar_domain *domain,
599         u64 start, u64 end)
600 {
601         int addr_width = agaw_to_width(domain->agaw);
602         struct dma_pte *pte;
603         int total = agaw_to_level(domain->agaw);
604         int level;
605         u64 tmp;
606
607         start &= (((u64)1) << addr_width) - 1;
608         end &= (((u64)1) << addr_width) - 1;
609
610         /* we don't need lock here, nobody else touches the iova range */
611         level = 2;
612         while (level <= total) {
613                 tmp = align_to_level(start, level);
614                 if (tmp >= end || (tmp + level_size(level) > end))
615                         return;
616
617                 while (tmp < end) {
618                         pte = dma_addr_level_pte(domain, tmp, level);
619                         if (pte) {
620                                 free_pgtable_page(
621                                         phys_to_virt(dma_pte_addr(pte)));
622                                 dma_clear_pte(pte);
623                                 __iommu_flush_cache(domain->iommu,
624                                                 pte, sizeof(*pte));
625                         }
626                         tmp += level_size(level);
627                 }
628                 level++;
629         }
630         /* free pgd */
631         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
632                 free_pgtable_page(domain->pgd);
633                 domain->pgd = NULL;
634         }
635 }
636
637 /* iommu handling */
638 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
639 {
640         struct root_entry *root;
641         unsigned long flags;
642
643         root = (struct root_entry *)alloc_pgtable_page();
644         if (!root)
645                 return -ENOMEM;
646
647         __iommu_flush_cache(iommu, root, ROOT_SIZE);
648
649         spin_lock_irqsave(&iommu->lock, flags);
650         iommu->root_entry = root;
651         spin_unlock_irqrestore(&iommu->lock, flags);
652
653         return 0;
654 }
655
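/* Point the hardware at the root entry table and wait for it to latch */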
656 static void iommu_set_root_entry(struct intel_iommu *iommu)
657 {
658         void *addr;
659         u32 cmd, sts;
660         unsigned long flag;
661
662         addr = iommu->root_entry;
663
664         spin_lock_irqsave(&iommu->register_lock, flag);
665         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
666
667         cmd = iommu->gcmd | DMA_GCMD_SRTP;
668         writel(cmd, iommu->reg + DMAR_GCMD_REG);
669
670         /* Make sure hardware completes it */
671         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
672                 readl, (sts & DMA_GSTS_RTPS), sts);
673
674         spin_unlock_irqrestore(&iommu->register_lock, flag);
675 }
676
677 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
678 {
679         u32 val;
680         unsigned long flag;
681
682         if (!cap_rwbf(iommu->cap))
683                 return;
684         val = iommu->gcmd | DMA_GCMD_WBF;
685
686         spin_lock_irqsave(&iommu->register_lock, flag);
687         writel(val, iommu->reg + DMAR_GCMD_REG);
688
689         /* Make sure hardware completes it */
690         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
691                         readl, (!(val & DMA_GSTS_WBFS)), val);
692
693         spin_unlock_irqrestore(&iommu->register_lock, flag);
694 }
695
696 /* return value determines whether we need a write buffer flush */
697 static int __iommu_flush_context(struct intel_iommu *iommu,
698         u16 did, u16 source_id, u8 function_mask, u64 type,
699         int non_present_entry_flush)
700 {
701         u64 val = 0;
702         unsigned long flag;
703
704         /*
705          * In the non-present entry flush case, if hardware doesn't cache
706          * non-present entries we do nothing; if it does cache them, we flush
707          * the entries of domain 0 (that domain id is used to cache any
708          * non-present entries)
709          */
710         if (non_present_entry_flush) {
711                 if (!cap_caching_mode(iommu->cap))
712                         return 1;
713                 else
714                         did = 0;
715         }
716
717         switch (type) {
718         case DMA_CCMD_GLOBAL_INVL:
719                 val = DMA_CCMD_GLOBAL_INVL;
720                 break;
721         case DMA_CCMD_DOMAIN_INVL:
722                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
723                 break;
724         case DMA_CCMD_DEVICE_INVL:
725                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
726                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
727                 break;
728         default:
729                 BUG();
730         }
731         val |= DMA_CCMD_ICC;
732
733         spin_lock_irqsave(&iommu->register_lock, flag);
734         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
735
736         /* Make sure hardware completes it */
737         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
738                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
739
740         spin_unlock_irqrestore(&iommu->register_lock, flag);
741
742         /* flush context entry will implicitly flush write buffer */
743         return 0;
744 }
745
746 /* return value determines whether we need a write buffer flush */
747 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
748         u64 addr, unsigned int size_order, u64 type,
749         int non_present_entry_flush)
750 {
751         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
752         u64 val = 0, val_iva = 0;
753         unsigned long flag;
754
755         /*
756          * In the non-present entry flush case, if hardware doesn't cache
757          * non-present entries we do nothing; if it does cache them, we flush
758          * the entries of domain 0 (that domain id is used to cache any
759          * non-present entries)
760          */
761         if (non_present_entry_flush) {
762                 if (!cap_caching_mode(iommu->cap))
763                         return 1;
764                 else
765                         did = 0;
766         }
767
768         switch (type) {
769         case DMA_TLB_GLOBAL_FLUSH:
770                 /* global flush doesn't need to set IVA_REG */
771                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
772                 break;
773         case DMA_TLB_DSI_FLUSH:
774                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
775                 break;
776         case DMA_TLB_PSI_FLUSH:
777                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
778                 /* Note: always flush non-leaf currently */
779                 val_iva = size_order | addr;
780                 break;
781         default:
782                 BUG();
783         }
784         /* Note: set drain read/write */
785 #if 0
786         /*
787          * This is probably only here to be extra safe; it looks like we
788          * can ignore it without any impact.
789          */
790         if (cap_read_drain(iommu->cap))
791                 val |= DMA_TLB_READ_DRAIN;
792 #endif
793         if (cap_write_drain(iommu->cap))
794                 val |= DMA_TLB_WRITE_DRAIN;
795
796         spin_lock_irqsave(&iommu->register_lock, flag);
797         /* Note: Only uses first TLB reg currently */
798         if (val_iva)
799                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
800         dmar_writeq(iommu->reg + tlb_offset + 8, val);
801
802         /* Make sure hardware completes it */
803         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
804                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
805
806         spin_unlock_irqrestore(&iommu->register_lock, flag);
807
808         /* check IOTLB invalidation granularity */
809         if (DMA_TLB_IAIG(val) == 0)
810                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
811         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
812                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
813                         (unsigned long long)DMA_TLB_IIRG(type),
814                         (unsigned long long)DMA_TLB_IAIG(val));
815         /* flush iotlb entry will implicitly flush write buffer */
816         return 0;
817 }
818
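/*
 * Page-selective IOTLB invalidation; falls back to a domain-selective
 * flush when the hardware lacks PSI support or the range is too large.
 */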
819 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
820         u64 addr, unsigned int pages, int non_present_entry_flush)
821 {
822         unsigned int mask;
823
824         BUG_ON(addr & (~VTD_PAGE_MASK));
825         BUG_ON(pages == 0);
826
827         /* Fall back to a domain-selective flush if there is no PSI support */
828         if (!cap_pgsel_inv(iommu->cap))
829                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
830                                                 DMA_TLB_DSI_FLUSH,
831                                                 non_present_entry_flush);
832
833         /*
834          * PSI requires the page size to be 2 ^ x, and the base address to be
835          * naturally aligned to that size
836          */
837         mask = ilog2(__roundup_pow_of_two(pages));
838         /* Fall back to a domain-selective flush if the size is too big */
839         if (mask > cap_max_amask_val(iommu->cap))
840                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
841                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
842
843         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
844                                         DMA_TLB_PSI_FLUSH,
845                                         non_present_entry_flush);
846 }
847
848 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
849 {
850         u32 pmen;
851         unsigned long flags;
852
853         spin_lock_irqsave(&iommu->register_lock, flags);
854         pmen = readl(iommu->reg + DMAR_PMEN_REG);
855         pmen &= ~DMA_PMEN_EPM;
856         writel(pmen, iommu->reg + DMAR_PMEN_REG);
857
858         /* wait for the protected region status bit to clear */
859         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
860                 readl, !(pmen & DMA_PMEN_PRS), pmen);
861
862         spin_unlock_irqrestore(&iommu->register_lock, flags);
863 }
864
865 static int iommu_enable_translation(struct intel_iommu *iommu)
866 {
867         u32 sts;
868         unsigned long flags;
869
870         spin_lock_irqsave(&iommu->register_lock, flags);
871         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
872
873         /* Make sure hardware completes it */
874         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
875                 readl, (sts & DMA_GSTS_TES), sts);
876
877         iommu->gcmd |= DMA_GCMD_TE;
878         spin_unlock_irqrestore(&iommu->register_lock, flags);
879         return 0;
880 }
881
882 static int iommu_disable_translation(struct intel_iommu *iommu)
883 {
884         u32 sts;
885         unsigned long flag;
886
887         spin_lock_irqsave(&iommu->register_lock, flag);
888         iommu->gcmd &= ~DMA_GCMD_TE;
889         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
890
891         /* Make sure hardware completes it */
892         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
893                 readl, (!(sts & DMA_GSTS_TES)), sts);
894
895         spin_unlock_irqrestore(&iommu->register_lock, flag);
896         return 0;
897 }
898
899 /* iommu interrupt handling. Most of it is MSI-like. */
900
901 static const char *fault_reason_strings[] =
902 {
903         "Software",
904         "Present bit in root entry is clear",
905         "Present bit in context entry is clear",
906         "Invalid context entry",
907         "Access beyond MGAW",
908         "PTE Write access is not set",
909         "PTE Read access is not set",
910         "Next page table ptr is invalid",
911         "Root table address invalid",
912         "Context table ptr is invalid",
913         "non-zero reserved fields in RTP",
914         "non-zero reserved fields in CTP",
915         "non-zero reserved fields in PTE",
916 };
917 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
918
919 const char *dmar_get_fault_reason(u8 fault_reason)
920 {
921         if (fault_reason > MAX_FAULT_REASON_IDX)
922                 return "Unknown";
923         else
924                 return fault_reason_strings[fault_reason];
925 }
926
927 void dmar_msi_unmask(unsigned int irq)
928 {
929         struct intel_iommu *iommu = get_irq_data(irq);
930         unsigned long flag;
931
932         /* unmask it */
933         spin_lock_irqsave(&iommu->register_lock, flag);
934         writel(0, iommu->reg + DMAR_FECTL_REG);
935         /* Read back a reg to flush the posted write */
936         readl(iommu->reg + DMAR_FECTL_REG);
937         spin_unlock_irqrestore(&iommu->register_lock, flag);
938 }
939
940 void dmar_msi_mask(unsigned int irq)
941 {
942         unsigned long flag;
943         struct intel_iommu *iommu = get_irq_data(irq);
944
945         /* mask it */
946         spin_lock_irqsave(&iommu->register_lock, flag);
947         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
948         /* Read back a reg to flush the posted write */
949         readl(iommu->reg + DMAR_FECTL_REG);
950         spin_unlock_irqrestore(&iommu->register_lock, flag);
951 }
952
953 void dmar_msi_write(int irq, struct msi_msg *msg)
954 {
955         struct intel_iommu *iommu = get_irq_data(irq);
956         unsigned long flag;
957
958         spin_lock_irqsave(&iommu->register_lock, flag);
959         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
960         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
961         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
962         spin_unlock_irqrestore(&iommu->register_lock, flag);
963 }
964
965 void dmar_msi_read(int irq, struct msi_msg *msg)
966 {
967         struct intel_iommu *iommu = get_irq_data(irq);
968         unsigned long flag;
969
970         spin_lock_irqsave(&iommu->register_lock, flag);
971         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
972         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
973         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
974         spin_unlock_irqrestore(&iommu->register_lock, flag);
975 }
976
977 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
978                 u8 fault_reason, u16 source_id, unsigned long long addr)
979 {
980         const char *reason;
981
982         reason = dmar_get_fault_reason(fault_reason);
983
984         printk(KERN_ERR
985                 "DMAR:[%s] Request device [%02x:%02x.%d] "
986                 "fault addr %llx\n"
987                 "DMAR:[fault reason %02d] %s\n",
988                 (type ? "DMA Read" : "DMA Write"),
989                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
990                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
991         return 0;
992 }
993
994 #define PRIMARY_FAULT_REG_LEN (16)
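/*
 * Fault interrupt handler: walk the primary fault recording registers,
 * report and clear each pending fault, then clear any overflow status.
 */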
995 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
996 {
997         struct intel_iommu *iommu = dev_id;
998         int reg, fault_index;
999         u32 fault_status;
1000         unsigned long flag;
1001
1002         spin_lock_irqsave(&iommu->register_lock, flag);
1003         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1004
1005         /* TBD: ignore advanced fault log currently */
1006         if (!(fault_status & DMA_FSTS_PPF))
1007                 goto clear_overflow;
1008
1009         fault_index = dma_fsts_fault_record_index(fault_status);
1010         reg = cap_fault_reg_offset(iommu->cap);
1011         while (1) {
1012                 u8 fault_reason;
1013                 u16 source_id;
1014                 u64 guest_addr;
1015                 int type;
1016                 u32 data;
1017
1018                 /* highest 32 bits */
1019                 data = readl(iommu->reg + reg +
1020                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1021                 if (!(data & DMA_FRCD_F))
1022                         break;
1023
1024                 fault_reason = dma_frcd_fault_reason(data);
1025                 type = dma_frcd_type(data);
1026
1027                 data = readl(iommu->reg + reg +
1028                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1029                 source_id = dma_frcd_source_id(data);
1030
1031                 guest_addr = dmar_readq(iommu->reg + reg +
1032                                 fault_index * PRIMARY_FAULT_REG_LEN);
1033                 guest_addr = dma_frcd_page_addr(guest_addr);
1034                 /* clear the fault */
1035                 writel(DMA_FRCD_F, iommu->reg + reg +
1036                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1037
1038                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1039
1040                 iommu_page_fault_do_one(iommu, type, fault_reason,
1041                                 source_id, guest_addr);
1042
1043                 fault_index++;
1044                 if (fault_index > cap_num_fault_regs(iommu->cap))
1045                         fault_index = 0;
1046                 spin_lock_irqsave(&iommu->register_lock, flag);
1047         }
1048 clear_overflow:
1049         /* clear primary fault overflow */
1050         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1051         if (fault_status & DMA_FSTS_PFO)
1052                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1053
1054         spin_unlock_irqrestore(&iommu->register_lock, flag);
1055         return IRQ_HANDLED;
1056 }
1057
1058 int dmar_set_interrupt(struct intel_iommu *iommu)
1059 {
1060         int irq, ret;
1061
1062         irq = create_irq();
1063         if (!irq) {
1064                 printk(KERN_ERR "IOMMU: no free vectors\n");
1065                 return -EINVAL;
1066         }
1067
1068         set_irq_data(irq, iommu);
1069         iommu->irq = irq;
1070
1071         ret = arch_setup_dmar_msi(irq);
1072         if (ret) {
1073                 set_irq_data(irq, NULL);
1074                 iommu->irq = 0;
1075                 destroy_irq(irq);
1076                 return 0;
1077         }
1078
1079         /* Make sure any pending faults are cleared before requesting the irq */
1080         iommu_page_fault(irq, iommu);
1081
1082         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1083         if (ret)
1084                 printk(KERN_ERR "IOMMU: can't request irq\n");
1085         return ret;
1086 }
1087
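/* Allocate the per-IOMMU domain id bitmap and domain pointer array */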
1088 static int iommu_init_domains(struct intel_iommu *iommu)
1089 {
1090         unsigned long ndomains;
1091         unsigned long nlongs;
1092
1093         ndomains = cap_ndoms(iommu->cap);
1094         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1095         nlongs = BITS_TO_LONGS(ndomains);
1096
1097         /* TBD: there might be 64K domains,
1098          * consider a different allocation scheme for future chips
1099          */
1100         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1101         if (!iommu->domain_ids) {
1102                 printk(KERN_ERR "Allocating domain id array failed\n");
1103                 return -ENOMEM;
1104         }
1105         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1106                         GFP_KERNEL);
1107         if (!iommu->domains) {
1108                 printk(KERN_ERR "Allocating domain array failed\n");
1109                 kfree(iommu->domain_ids);
1110                 return -ENOMEM;
1111         }
1112
1113         spin_lock_init(&iommu->lock);
1114
1115         /*
1116          * If caching mode is set, then invalid translations are tagged
1117          * with domain id 0, hence we need to pre-allocate it.
1118          */
1119         if (cap_caching_mode(iommu->cap))
1120                 set_bit(0, iommu->domain_ids);
1121         return 0;
1122 }
1123
1124
1125 static void domain_exit(struct dmar_domain *domain);
1126
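/*
 * Tear down all domains on this IOMMU, disable translation, release its
 * fault interrupt and free the root/context tables.
 */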
1127 void free_dmar_iommu(struct intel_iommu *iommu)
1128 {
1129         struct dmar_domain *domain;
1130         int i;
1131
1132         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1133         for (; i < cap_ndoms(iommu->cap); ) {
1134                 domain = iommu->domains[i];
1135                 clear_bit(i, iommu->domain_ids);
1136                 domain_exit(domain);
1137                 i = find_next_bit(iommu->domain_ids,
1138                         cap_ndoms(iommu->cap), i+1);
1139         }
1140
1141         if (iommu->gcmd & DMA_GCMD_TE)
1142                 iommu_disable_translation(iommu);
1143
1144         if (iommu->irq) {
1145                 set_irq_data(iommu->irq, NULL);
1146                 /* This will mask the irq */
1147                 free_irq(iommu->irq, iommu);
1148                 destroy_irq(iommu->irq);
1149         }
1150
1151         kfree(iommu->domains);
1152         kfree(iommu->domain_ids);
1153
1154         /* free context mapping */
1155         free_context_table(iommu);
1156 }
1157
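/* Allocate a dmar_domain and a free domain id on this IOMMU */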
1158 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1159 {
1160         unsigned long num;
1161         unsigned long ndomains;
1162         struct dmar_domain *domain;
1163         unsigned long flags;
1164
1165         domain = alloc_domain_mem();
1166         if (!domain)
1167                 return NULL;
1168
1169         ndomains = cap_ndoms(iommu->cap);
1170
1171         spin_lock_irqsave(&iommu->lock, flags);
1172         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1173         if (num >= ndomains) {
1174                 spin_unlock_irqrestore(&iommu->lock, flags);
1175                 free_domain_mem(domain);
1176                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1177                 return NULL;
1178         }
1179
1180         set_bit(num, iommu->domain_ids);
1181         domain->id = num;
1182         domain->iommu = iommu;
1183         domain->flags = 0;
1184         iommu->domains[num] = domain;
1185         spin_unlock_irqrestore(&iommu->lock, flags);
1186
1187         return domain;
1188 }
1189
1190 static void iommu_free_domain(struct dmar_domain *domain)
1191 {
1192         unsigned long flags;
1193
1194         spin_lock_irqsave(&domain->iommu->lock, flags);
1195         clear_bit(domain->id, domain->iommu->domain_ids);
1196         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1197 }
1198
1199 static struct iova_domain reserved_iova_list;
1200 static struct lock_class_key reserved_alloc_key;
1201 static struct lock_class_key reserved_rbtree_key;
1202
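/*
 * Reserve IOVA ranges that must never be handed out for DMA: the IOAPIC
 * MMIO window and every PCI device's MMIO resources, so that
 * peer-to-peer addresses are never used as IOVAs.
 */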
1203 static void dmar_init_reserved_ranges(void)
1204 {
1205         struct pci_dev *pdev = NULL;
1206         struct iova *iova;
1207         int i;
1208         u64 addr, size;
1209
1210         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1211
1212         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1213                 &reserved_alloc_key);
1214         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1215                 &reserved_rbtree_key);
1216
1217         /* IOAPIC ranges shouldn't be accessed by DMA */
1218         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1219                 IOVA_PFN(IOAPIC_RANGE_END));
1220         if (!iova)
1221                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1222
1223         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1224         for_each_pci_dev(pdev) {
1225                 struct resource *r;
1226
1227                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1228                         r = &pdev->resource[i];
1229                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1230                                 continue;
1231                         addr = r->start;
1232                         addr &= PAGE_MASK;
1233                         size = r->end - addr;
1234                         size = PAGE_ALIGN(size);
1235                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1236                                 IOVA_PFN(size + addr) - 1);
1237                         if (!iova)
1238                                 printk(KERN_ERR "Reserve iova failed\n");
1239                 }
1240         }
1241
1242 }
1243
1244 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1245 {
1246         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1247 }
1248
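/*
 * Round a guest address width up to the next width the page-table
 * format can express: 12 bits of page offset plus a whole number of
 * 9-bit levels, capped at 64 bits.
 */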
1249 static inline int guestwidth_to_adjustwidth(int gaw)
1250 {
1251         int agaw;
1252         int r = (gaw - 12) % 9;
1253
1254         if (r == 0)
1255                 agaw = gaw;
1256         else
1257                 agaw = gaw + 9 - r;
1258         if (agaw > 64)
1259                 agaw = 64;
1260         return agaw;
1261 }
1262
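/*
 * Initialize a freshly allocated domain: set up its IOVA allocator and
 * reserved ranges, pick a suitable agaw and allocate the top-level page
 * directory.
 */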
1263 static int domain_init(struct dmar_domain *domain, int guest_width)
1264 {
1265         struct intel_iommu *iommu;
1266         int adjust_width, agaw;
1267         unsigned long sagaw;
1268
1269         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1270         spin_lock_init(&domain->mapping_lock);
1271
1272         domain_reserve_special_ranges(domain);
1273
1274         /* calculate AGAW */
1275         iommu = domain->iommu;
1276         if (guest_width > cap_mgaw(iommu->cap))
1277                 guest_width = cap_mgaw(iommu->cap);
1278         domain->gaw = guest_width;
1279         adjust_width = guestwidth_to_adjustwidth(guest_width);
1280         agaw = width_to_agaw(adjust_width);
1281         sagaw = cap_sagaw(iommu->cap);
1282         if (!test_bit(agaw, &sagaw)) {
1283                 /* hardware doesn't support it, choose a bigger one */
1284                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1285                 agaw = find_next_bit(&sagaw, 5, agaw);
1286                 if (agaw >= 5)
1287                         return -ENODEV;
1288         }
1289         domain->agaw = agaw;
1290         INIT_LIST_HEAD(&domain->devices);
1291
1292         /* always allocate the top pgd */
1293         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1294         if (!domain->pgd)
1295                 return -ENOMEM;
1296         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1297         return 0;
1298 }
1299
1300 static void domain_exit(struct dmar_domain *domain)
1301 {
1302         u64 end;
1303
1304         /* Domain 0 is reserved, so don't process it */
1305         if (!domain)
1306                 return;
1307
1308         domain_remove_dev_info(domain);
1309         /* destroy iovas */
1310         put_iova_domain(&domain->iovad);
1311         end = DOMAIN_MAX_ADDR(domain->gaw);
1312         end = end & (~PAGE_MASK);
1313
1314         /* clear ptes */
1315         dma_pte_clear_range(domain, 0, end);
1316
1317         /* free page tables */
1318         dma_pte_free_pagetable(domain, 0, end);
1319
1320         iommu_free_domain(domain);
1321         free_domain_mem(domain);
1322 }
1323
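/*
 * Point the context entry for bus/devfn at this domain's page tables
 * and flush the context cache (or the write buffer) so the hardware
 * sees the new mapping.
 */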
1324 static int domain_context_mapping_one(struct dmar_domain *domain,
1325                 u8 bus, u8 devfn)
1326 {
1327         struct context_entry *context;
1328         struct intel_iommu *iommu = domain->iommu;
1329         unsigned long flags;
1330
1331         pr_debug("Set context mapping for %02x:%02x.%d\n",
1332                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1333         BUG_ON(!domain->pgd);
1334         context = device_to_context_entry(iommu, bus, devfn);
1335         if (!context)
1336                 return -ENOMEM;
1337         spin_lock_irqsave(&iommu->lock, flags);
1338         if (context_present(context)) {
1339                 spin_unlock_irqrestore(&iommu->lock, flags);
1340                 return 0;
1341         }
1342
1343         context_set_domain_id(context, domain->id);
1344         context_set_address_width(context, domain->agaw);
1345         context_set_address_root(context, virt_to_phys(domain->pgd));
1346         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1347         context_set_fault_enable(context);
1348         context_set_present(context);
1349         __iommu_flush_cache(iommu, context, sizeof(*context));
1350
1351         /* it's a non-present to present mapping */
1352         if (iommu->flush.flush_context(iommu, domain->id,
1353                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1354                 DMA_CCMD_DEVICE_INVL, 1))
1355                 iommu_flush_write_buffer(iommu);
1356         else
1357                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1358
1359         spin_unlock_irqrestore(&iommu->lock, flags);
1360         return 0;
1361 }
1362
1363 static int
1364 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1365 {
1366         int ret;
1367         struct pci_dev *tmp, *parent;
1368
1369         ret = domain_context_mapping_one(domain, pdev->bus->number,
1370                 pdev->devfn);
1371         if (ret)
1372                 return ret;
1373
1374         /* dependent device mapping */
1375         tmp = pci_find_upstream_pcie_bridge(pdev);
1376         if (!tmp)
1377                 return 0;
1378         /* Secondary interface's bus number and devfn 0 */
1379         parent = pdev->bus->self;
1380         while (parent != tmp) {
1381                 ret = domain_context_mapping_one(domain, parent->bus->number,
1382                         parent->devfn);
1383                 if (ret)
1384                         return ret;
1385                 parent = parent->bus->self;
1386         }
1387         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1388                 return domain_context_mapping_one(domain,
1389                         tmp->subordinate->number, 0);
1390         else /* this is a legacy PCI bridge */
1391                 return domain_context_mapping_one(domain,
1392                         tmp->bus->number, tmp->devfn);
1393 }
1394
1395 static int domain_context_mapped(struct dmar_domain *domain,
1396         struct pci_dev *pdev)
1397 {
1398         int ret;
1399         struct pci_dev *tmp, *parent;
1400
1401         ret = device_context_mapped(domain->iommu,
1402                 pdev->bus->number, pdev->devfn);
1403         if (!ret)
1404                 return ret;
1405         /* dependent device mapping */
1406         tmp = pci_find_upstream_pcie_bridge(pdev);
1407         if (!tmp)
1408                 return ret;
1409         /* Secondary interface's bus number and devfn 0 */
1410         parent = pdev->bus->self;
1411         while (parent != tmp) {
1412                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1413                         parent->devfn);
1414                 if (!ret)
1415                         return ret;
1416                 parent = parent->bus->self;
1417         }
1418         if (tmp->is_pcie)
1419                 return device_context_mapped(domain->iommu,
1420                         tmp->subordinate->number, 0);
1421         else
1422                 return device_context_mapped(domain->iommu,
1423                         tmp->bus->number, tmp->devfn);
1424 }
1425
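/*
 * Map the physical range [hpa, hpa + size) at IO virtual address iova,
 * one 4KiB page at a time, with the given protection bits.
 */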
1426 static int
1427 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1428                         u64 hpa, size_t size, int prot)
1429 {
1430         u64 start_pfn, end_pfn;
1431         struct dma_pte *pte;
1432         int index;
1433         int addr_width = agaw_to_width(domain->agaw);
1434
1435         hpa &= (((u64)1) << addr_width) - 1;
1436
1437         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1438                 return -EINVAL;
1439         iova &= PAGE_MASK;
1440         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1441         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1442         index = 0;
1443         while (start_pfn < end_pfn) {
1444                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1445                 if (!pte)
1446                         return -ENOMEM;
1447                 /* We don't need lock here, nobody else
1448                  * touches the iova range
1449                  */
1450                 BUG_ON(dma_pte_addr(pte));
1451                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1452                 dma_set_pte_prot(pte, prot);
1453                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1454                 start_pfn++;
1455                 index++;
1456         }
1457         return 0;
1458 }
1459
1460 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1461 {
1462         clear_context_table(domain->iommu, bus, devfn);
1463         domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1464                                            DMA_CCMD_GLOBAL_INVL, 0);
1465         domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1466                                          DMA_TLB_GLOBAL_FLUSH, 0);
1467 }
1468
1469 static void domain_remove_dev_info(struct dmar_domain *domain)
1470 {
1471         struct device_domain_info *info;
1472         unsigned long flags;
1473
1474         spin_lock_irqsave(&device_domain_lock, flags);
1475         while (!list_empty(&domain->devices)) {
1476                 info = list_entry(domain->devices.next,
1477                         struct device_domain_info, link);
1478                 list_del(&info->link);
1479                 list_del(&info->global);
1480                 if (info->dev)
1481                         info->dev->dev.archdata.iommu = NULL;
1482                 spin_unlock_irqrestore(&device_domain_lock, flags);
1483
1484                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1485                 free_devinfo_mem(info);
1486
1487                 spin_lock_irqsave(&device_domain_lock, flags);
1488         }
1489         spin_unlock_irqrestore(&device_domain_lock, flags);
1490 }
1491
1492 /*
1493  * find_domain
1494  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1495  */
1496 static struct dmar_domain *
1497 find_domain(struct pci_dev *pdev)
1498 {
1499         struct device_domain_info *info;
1500
1501         /* No lock here, assumes no domain exit in normal case */
1502         info = pdev->dev.archdata.iommu;
1503         if (info)
1504                 return info->domain;
1505         return NULL;
1506 }
1507
1508 /* Find or allocate an initialized domain for the device */
1509 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1510 {
1511         struct dmar_domain *domain, *found = NULL;
1512         struct intel_iommu *iommu;
1513         struct dmar_drhd_unit *drhd;
1514         struct device_domain_info *info, *tmp;
1515         struct pci_dev *dev_tmp;
1516         unsigned long flags;
1517         int bus = 0, devfn = 0;
1518
1519         domain = find_domain(pdev);
1520         if (domain)
1521                 return domain;
1522
1523         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1524         if (dev_tmp) {
1525                 if (dev_tmp->is_pcie) {
1526                         bus = dev_tmp->subordinate->number;
1527                         devfn = 0;
1528                 } else {
1529                         bus = dev_tmp->bus->number;
1530                         devfn = dev_tmp->devfn;
1531                 }
1532                 spin_lock_irqsave(&device_domain_lock, flags);
1533                 list_for_each_entry(info, &device_domain_list, global) {
1534                         if (info->bus == bus && info->devfn == devfn) {
1535                                 found = info->domain;
1536                                 break;
1537                         }
1538                 }
1539                 spin_unlock_irqrestore(&device_domain_lock, flags);
1540                 /* the pcie-pci bridge already has a domain, use it */
1541                 if (found) {
1542                         domain = found;
1543                         goto found_domain;
1544                 }
1545         }
1546
1547         /* Allocate new domain for the device */
1548         drhd = dmar_find_matched_drhd_unit(pdev);
1549         if (!drhd) {
1550                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1551                         pci_name(pdev));
1552                 return NULL;
1553         }
1554         iommu = drhd->iommu;
1555
1556         domain = iommu_alloc_domain(iommu);
1557         if (!domain)
1558                 goto error;
1559
1560         if (domain_init(domain, gaw)) {
1561                 domain_exit(domain);
1562                 goto error;
1563         }
1564
1565         /* register pcie-to-pci device */
1566         if (dev_tmp) {
1567                 info = alloc_devinfo_mem();
1568                 if (!info) {
1569                         domain_exit(domain);
1570                         goto error;
1571                 }
1572                 info->bus = bus;
1573                 info->devfn = devfn;
1574                 info->dev = NULL;
1575                 info->domain = domain;
1576                 /* This domain is shared by devices under p2p bridge */
1577                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1578
1579                 /* the pcie-to-pci bridge already has a domain, use it */
1580                 found = NULL;
1581                 spin_lock_irqsave(&device_domain_lock, flags);
1582                 list_for_each_entry(tmp, &device_domain_list, global) {
1583                         if (tmp->bus == bus && tmp->devfn == devfn) {
1584                                 found = tmp->domain;
1585                                 break;
1586                         }
1587                 }
1588                 if (found) {
1589                         free_devinfo_mem(info);
1590                         domain_exit(domain);
1591                         domain = found;
1592                 } else {
1593                         list_add(&info->link, &domain->devices);
1594                         list_add(&info->global, &device_domain_list);
1595                 }
1596                 spin_unlock_irqrestore(&device_domain_lock, flags);
1597         }
1598
1599 found_domain:
1600         info = alloc_devinfo_mem();
1601         if (!info)
1602                 goto error;
1603         info->bus = pdev->bus->number;
1604         info->devfn = pdev->devfn;
1605         info->dev = pdev;
1606         info->domain = domain;
1607         spin_lock_irqsave(&device_domain_lock, flags);
1608         /* somebody else was faster and already attached a domain */
1609         found = find_domain(pdev);
1610         if (found != NULL) {
1611                 spin_unlock_irqrestore(&device_domain_lock, flags);
1612                 if (found != domain) {
1613                         domain_exit(domain);
1614                         domain = found;
1615                 }
1616                 free_devinfo_mem(info);
1617                 return domain;
1618         }
1619         list_add(&info->link, &domain->devices);
1620         list_add(&info->global, &device_domain_list);
1621         pdev->dev.archdata.iommu = info;
1622         spin_unlock_irqrestore(&device_domain_lock, flags);
1623         return domain;
1624 error:
1625         /* recheck: another thread may have set up the domain meanwhile */
1626         return find_domain(pdev);
1627 }
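
/*
 * Illustrative sketch, not part of the original driver and not compiled:
 * how the (bus, devfn) key used above for domain sharing could be derived
 * on its own.  Devices behind a PCIe-to-PCI bridge are not individually
 * visible to the IOMMU, so they all share the bridge's alias.  The helper
 * name is hypothetical.
 */
#if 0
static void example_domain_alias(struct pci_dev *pdev, int *bus, int *devfn)
{
        struct pci_dev *bridge = pci_find_upstream_pcie_bridge(pdev);

        if (!bridge) {
                /* directly visible to the IOMMU: use the device itself */
                *bus = pdev->bus->number;
                *devfn = pdev->devfn;
        } else if (bridge->is_pcie) {
                /* PCIe-to-PCI bridge: requests carry the secondary bus, devfn 0 */
                *bus = bridge->subordinate->number;
                *devfn = 0;
        } else {
                /* legacy PCI bridge: requests appear to come from the bridge */
                *bus = bridge->bus->number;
                *devfn = bridge->devfn;
        }
}
#endif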
1628
1629 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1630                                       unsigned long long start,
1631                                       unsigned long long end)
1632 {
1633         struct dmar_domain *domain;
1634         unsigned long size;
1635         unsigned long long base;
1636         int ret;
1637
1638         printk(KERN_INFO
1639                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1640                 pci_name(pdev), start, end);
1641         /* page table init */
1642         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1643         if (!domain)
1644                 return -ENOMEM;
1645
1646         /* The address might not be aligned */
1647         base = start & PAGE_MASK;
1648         size = end - base;
1649         size = PAGE_ALIGN(size);
1650         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1651                         IOVA_PFN(base + size) - 1)) {
1652                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1653                 ret = -ENOMEM;
1654                 goto error;
1655         }
1656
1657         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1658                 size, base, pci_name(pdev));
1659         /*
1660          * The RMRR range might overlap memory that is already mapped;
1661          * clear any existing PTEs first.
1662          */
1663         dma_pte_clear_range(domain, base, base + size);
1664
1665         ret = domain_page_mapping(domain, base, base, size,
1666                 DMA_PTE_READ|DMA_PTE_WRITE);
1667         if (ret)
1668                 goto error;
1669
1670         /* context entry init */
1671         ret = domain_context_mapping(domain, pdev);
1672         if (!ret)
1673                 return 0;
1674 error:
1675         domain_exit(domain);
1676         return ret;
1677
1678 }
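
/*
 * Worked example, illustrative only and not compiled: with 4 KiB pages, a
 * hypothetical RMRR of [0x7d800dc0, 0x7d900dbf] (so end = end_address + 1 =
 * 0x7d900dc0) is widened to whole pages before being reserved and
 * identity-mapped by iommu_prepare_identity_map() above.
 */
#if 0
static void example_identity_map_rounding(void)
{
        u64 start = 0x7d800dc0ULL, end = 0x7d900dc0ULL;
        u64 base = start & PAGE_MASK;           /* 0x7d800000 */
        u64 size = PAGE_ALIGN(end - base);      /* 0x101000, i.e. 257 pages */

        /* reserve_iova() then pins IOVA_PFN(base)..IOVA_PFN(base + size) - 1 */
        WARN_ON(base != 0x7d800000ULL || size != 0x101000ULL);
}
#endif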
1679
1680 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1681         struct pci_dev *pdev)
1682 {
1683         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1684                 return 0;
1685         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1686                 rmrr->end_address + 1);
1687 }
1688
1689 #ifdef CONFIG_DMAR_GFX_WA
1690 struct iommu_prepare_data {
1691         struct pci_dev *pdev;
1692         int ret;
1693 };
1694
1695 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1696                                          unsigned long end_pfn, void *datax)
1697 {
1698         struct iommu_prepare_data *data;
1699
1700         data = (struct iommu_prepare_data *)datax;
1701
1702         data->ret = iommu_prepare_identity_map(data->pdev,
1703                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1704         return data->ret;
1705
1706 }
1707
1708 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1709 {
1710         int nid;
1711         struct iommu_prepare_data data;
1712
1713         data.pdev = pdev;
1714         data.ret = 0;
1715
1716         for_each_online_node(nid) {
1717                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1718                 if (data.ret)
1719                         return data.ret;
1720         }
1721         return data.ret;
1722 }
1723
1724 static void __init iommu_prepare_gfx_mapping(void)
1725 {
1726         struct pci_dev *pdev = NULL;
1727         int ret;
1728
1729         for_each_pci_dev(pdev) {
1730                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1731                                 !IS_GFX_DEVICE(pdev))
1732                         continue;
1733                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1734                         pci_name(pdev));
1735                 ret = iommu_prepare_with_active_regions(pdev);
1736                 if (ret)
1737                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1738         }
1739 }
1740 #else /* !CONFIG_DMAR_GFX_WA */
1741 static inline void iommu_prepare_gfx_mapping(void)
1742 {
1743         return;
1744 }
1745 #endif
1746
1747 #ifdef CONFIG_DMAR_FLOPPY_WA
1748 static inline void iommu_prepare_isa(void)
1749 {
1750         struct pci_dev *pdev;
1751         int ret;
1752
1753         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1754         if (!pdev)
1755                 return;
1756
1757         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1758         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1759
1760         if (ret)
1761                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1762                         "floppy might not work\n");
1763
1764 }
1765 #else
1766 static inline void iommu_prepare_isa(void)
1767 {
1768         return;
1769 }
1770 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1771
1772 static int __init init_dmars(void)
1773 {
1774         struct dmar_drhd_unit *drhd;
1775         struct dmar_rmrr_unit *rmrr;
1776         struct pci_dev *pdev;
1777         struct intel_iommu *iommu;
1778         int i, ret, unit = 0;
1779
1780         /*
1781          * for each drhd
1782          *    allocate root
1783          *    initialize and program root entry to not present
1784          * endfor
1785          */
1786         for_each_drhd_unit(drhd) {
1787                 g_num_of_iommus++;
1788                 /*
1789                  * No lock needed: this counter is only incremented in the
1790                  * single-threaded kernel __init code path; all other
1791                  * accesses are read-only.
1792                  */
1793         }
1794
1795         deferred_flush = kzalloc(g_num_of_iommus *
1796                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1797         if (!deferred_flush) {
1798                 ret = -ENOMEM;
1799                 goto error;
1800         }
1801
1802         for_each_drhd_unit(drhd) {
1803                 if (drhd->ignored)
1804                         continue;
1805
1806                 iommu = drhd->iommu;
1807
1808                 ret = iommu_init_domains(iommu);
1809                 if (ret)
1810                         goto error;
1811
1812                 /*
1813                  * TBD:
1814                  * we could share the same root & context tables
1815                  * among all IOMMUs.  Need to split this out later.
1816                  */
1817                 ret = iommu_alloc_root_entry(iommu);
1818                 if (ret) {
1819                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1820                         goto error;
1821                 }
1822         }
1823
1824         for_each_drhd_unit(drhd) {
1825                 if (drhd->ignored)
1826                         continue;
1827
1828                 iommu = drhd->iommu;
1829                 if (dmar_enable_qi(iommu)) {
1830                         /*
1831                          * Queued Invalidation could not be enabled; fall back
1832                          * to register-based invalidation.
1833                          */
1834                         iommu->flush.flush_context = __iommu_flush_context;
1835                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1836                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1837                                "invalidation\n",
1838                                (unsigned long long)drhd->reg_base_addr);
1839                 } else {
1840                         iommu->flush.flush_context = qi_flush_context;
1841                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1842                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1843                                "invalidation\n",
1844                                (unsigned long long)drhd->reg_base_addr);
1845                 }
1846         }
1847
1848         /*
1849          * For each rmrr
1850          *   for each dev attached to rmrr
1851          *   do
1852          *     locate drhd for dev, alloc domain for dev
1853          *     allocate free domain
1854          *     allocate page table entries for rmrr
1855          *     if context not allocated for bus
1856          *           allocate and init context
1857          *           set present in root table for this bus
1858          *     init context with domain, translation etc
1859          *    endfor
1860          * endfor
1861          */
1862         for_each_rmrr_units(rmrr) {
1863                 for (i = 0; i < rmrr->devices_cnt; i++) {
1864                         pdev = rmrr->devices[i];
1865                         /* some BIOSes list non-existent devices in the DMAR table */
1866                         if (!pdev)
1867                                 continue;
1868                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1869                         if (ret)
1870                                 printk(KERN_ERR
1871                                  "IOMMU: mapping reserved region failed\n");
1872                 }
1873         }
1874
1875         iommu_prepare_gfx_mapping();
1876
1877         iommu_prepare_isa();
1878
1879         /*
1880          * for each drhd
1881          *   enable fault log
1882          *   global invalidate context cache
1883          *   global invalidate iotlb
1884          *   enable translation
1885          */
1886         for_each_drhd_unit(drhd) {
1887                 if (drhd->ignored)
1888                         continue;
1889                 iommu = drhd->iommu;
1890                 sprintf(iommu->name, "dmar%d", unit++);
1891
1892                 iommu_flush_write_buffer(iommu);
1893
1894                 ret = dmar_set_interrupt(iommu);
1895                 if (ret)
1896                         goto error;
1897
1898                 iommu_set_root_entry(iommu);
1899
1900                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1901                                            0);
1902                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1903                                          0);
1904                 iommu_disable_protect_mem_regions(iommu);
1905
1906                 ret = iommu_enable_translation(iommu);
1907                 if (ret)
1908                         goto error;
1909         }
1910
1911         return 0;
1912 error:
1913         for_each_drhd_unit(drhd) {
1914                 if (drhd->ignored)
1915                         continue;
1916                 iommu = drhd->iommu;
1917                 free_iommu(iommu);
1918         }
1919         return ret;
1920 }
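
/*
 * Condensed sketch, illustrative only and not compiled: the per-IOMMU
 * bring-up order that init_dmars() above performs, with error handling
 * and the RMRR/gfx/ISA preparation steps omitted.  The function name is
 * hypothetical.
 */
#if 0
static int example_enable_one_iommu(struct intel_iommu *iommu)
{
        iommu_init_domains(iommu);              /* domain-id allocator */
        iommu_alloc_root_entry(iommu);          /* empty root table */
        iommu_flush_write_buffer(iommu);
        dmar_set_interrupt(iommu);              /* fault reporting */
        iommu_set_root_entry(iommu);            /* point the HW at the root table */
        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL, 0);
        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0);
        iommu_disable_protect_mem_regions(iommu);
        return iommu_enable_translation(iommu); /* start remapping DMA */
}
#endif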
1921
1922 static inline u64 aligned_size(u64 host_addr, size_t size)
1923 {
1924         u64 addr;
1925         addr = (host_addr & (~PAGE_MASK)) + size;
1926         return PAGE_ALIGN(addr);
1927 }
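
/*
 * Example, illustrative only and not compiled: with 4 KiB pages, a
 * 0x2000-byte buffer that starts 3 bytes into a page needs three pages
 * once the in-page offset is added and the total is rounded up.
 */
#if 0
static void example_aligned_size(void)
{
        /* offset-in-page 0x003 + 0x2000 = 0x2003; PAGE_ALIGN -> 0x3000 */
        WARN_ON(aligned_size(0x1003, 0x2000) != 0x3000);
}
#endif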
1928
1929 struct iova *
1930 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1931 {
1932         struct iova *piova;
1933
1934         /* Make sure it's in range */
1935         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1936         if (!size || (IOVA_START_ADDR + size > end))
1937                 return NULL;
1938
1939         piova = alloc_iova(&domain->iovad,
1940                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1941         return piova;
1942 }
1943
1944 static struct iova *
1945 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1946                    size_t size, u64 dma_mask)
1947 {
1948         struct pci_dev *pdev = to_pci_dev(dev);
1949         struct iova *iova = NULL;
1950
1951         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1952                 iova = iommu_alloc_iova(domain, size, dma_mask);
1953         else {
1954                 /*
1955                  * First try to allocate an io virtual address in
1956                  * DMA_32BIT_MASK and if that fails then try allocating
1957                  * from higher range
1958                  */
1959                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1960                 if (!iova)
1961                         iova = iommu_alloc_iova(domain, size, dma_mask);
1962         }
1963
1964         if (!iova) {
1965                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1966                 return NULL;
1967         }
1968
1969         return iova;
1970 }
1971
1972 static struct dmar_domain *
1973 get_valid_domain_for_dev(struct pci_dev *pdev)
1974 {
1975         struct dmar_domain *domain;
1976         int ret;
1977
1978         domain = get_domain_for_dev(pdev,
1979                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1980         if (!domain) {
1981                 printk(KERN_ERR
1982                         "Allocating domain for %s failed\n", pci_name(pdev));
1983                 return NULL;
1984         }
1985
1986         /* make sure context mapping is ok */
1987         if (unlikely(!domain_context_mapped(domain, pdev))) {
1988                 ret = domain_context_mapping(domain, pdev);
1989                 if (ret) {
1990                         printk(KERN_ERR
1991                                 "Domain context map for %s failed\n",
1992                                 pci_name(pdev));
1993                         return NULL;
1994                 }
1995         }
1996
1997         return domain;
1998 }
1999
2000 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2001                                      size_t size, int dir, u64 dma_mask)
2002 {
2003         struct pci_dev *pdev = to_pci_dev(hwdev);
2004         struct dmar_domain *domain;
2005         phys_addr_t start_paddr;
2006         struct iova *iova;
2007         int prot = 0;
2008         int ret;
2009
2010         BUG_ON(dir == DMA_NONE);
2011         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2012                 return paddr;
2013
2014         domain = get_valid_domain_for_dev(pdev);
2015         if (!domain)
2016                 return 0;
2017
2018         size = aligned_size((u64)paddr, size);
2019
2020         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2021         if (!iova)
2022                 goto error;
2023
2024         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2025
2026         /*
2027          * Check if the DMAR supports zero-length reads on write-only
2028          * mappings.
2029          */
2030         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2031                         !cap_zlr(domain->iommu->cap))
2032                 prot |= DMA_PTE_READ;
2033         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2034                 prot |= DMA_PTE_WRITE;
2035         /*
2036          * [paddr, paddr + size) might cover only part of a page, but we must
2037          * map whole pages.  Note: if two parts of one page are mapped
2038          * separately, two IOVAs may map to the same host paddr, but this
2039          * is not a big problem.
2040          */
2041         ret = domain_page_mapping(domain, start_paddr,
2042                 ((u64)paddr) & PAGE_MASK, size, prot);
2043         if (ret)
2044                 goto error;
2045
2046         /* it's a non-present to present mapping */
2047         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
2048                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2049         if (ret)
2050                 iommu_flush_write_buffer(domain->iommu);
2051
2052         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2053
2054 error:
2055         if (iova)
2056                 __free_iova(&domain->iovad, iova);
2057         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2058                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2059         return 0;
2060 }
2061
2062 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2063                             size_t size, int dir)
2064 {
2065         return __intel_map_single(hwdev, paddr, size, dir,
2066                                   to_pci_dev(hwdev)->dma_mask);
2067 }
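
/*
 * Illustrative sketch, not part of this driver and not compiled: how a PCI
 * driver's streaming DMA mapping reaches intel_map_single() once dma_ops
 * points at intel_dma_ops (set up in intel_iommu_init() below).  The
 * function and buffer names are hypothetical; a returned handle of 0 means
 * the mapping failed, as in __intel_map_single() above.
 */
#if 0
static int example_driver_tx(struct pci_dev *pdev, void *buf, size_t len)
{
        dma_addr_t handle;

        /* dispatches through dma_ops->map_single, i.e. intel_map_single() */
        handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
        if (!handle)
                return -EIO;

        /* ... hand 'handle' to the device and wait for the DMA to finish ... */

        dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
        return 0;
}
#endif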
2068
2069 static void flush_unmaps(void)
2070 {
2071         int i, j;
2072
2073         timer_on = 0;
2074
2075         /* just flush them all */
2076         for (i = 0; i < g_num_of_iommus; i++) {
2077                 if (deferred_flush[i].next) {
2078                         struct intel_iommu *iommu =
2079                                 deferred_flush[i].domain[0]->iommu;
2080
2081                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2082                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2083                         for (j = 0; j < deferred_flush[i].next; j++) {
2084                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2085                                                 deferred_flush[i].iova[j]);
2086                         }
2087                         deferred_flush[i].next = 0;
2088                 }
2089         }
2090
2091         list_size = 0;
2092 }
2093
2094 static void flush_unmaps_timeout(unsigned long data)
2095 {
2096         unsigned long flags;
2097
2098         spin_lock_irqsave(&async_umap_flush_lock, flags);
2099         flush_unmaps();
2100         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2101 }
2102
2103 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2104 {
2105         unsigned long flags;
2106         int next, iommu_id;
2107
2108         spin_lock_irqsave(&async_umap_flush_lock, flags);
2109         if (list_size == HIGH_WATER_MARK)
2110                 flush_unmaps();
2111
2112         iommu_id = dom->iommu->seq_id;
2113
2114         next = deferred_flush[iommu_id].next;
2115         deferred_flush[iommu_id].domain[next] = dom;
2116         deferred_flush[iommu_id].iova[next] = iova;
2117         deferred_flush[iommu_id].next++;
2118
2119         if (!timer_on) {
2120                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2121                 timer_on = 1;
2122         }
2123         list_size++;
2124         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2125 }
2126
2127 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2128                         int dir)
2129 {
2130         struct pci_dev *pdev = to_pci_dev(dev);
2131         struct dmar_domain *domain;
2132         unsigned long start_addr;
2133         struct iova *iova;
2134
2135         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2136                 return;
2137         domain = find_domain(pdev);
2138         BUG_ON(!domain);
2139
2140         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2141         if (!iova)
2142                 return;
2143
2144         start_addr = iova->pfn_lo << PAGE_SHIFT;
2145         size = aligned_size((u64)dev_addr, size);
2146
2147         pr_debug("Device %s unmapping: %lx@%llx\n",
2148                 pci_name(pdev), size, (unsigned long long)start_addr);
2149
2150         /* clear the whole mapped range */
2151         dma_pte_clear_range(domain, start_addr, start_addr + size);
2152         /* free page tables */
2153         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2154         if (intel_iommu_strict) {
2155                 if (iommu_flush_iotlb_psi(domain->iommu,
2156                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2157                         iommu_flush_write_buffer(domain->iommu);
2158                 /* free iova */
2159                 __free_iova(&domain->iovad, iova);
2160         } else {
2161                 add_unmap(domain, iova);
2162                 /*
2163                  * queue up the release; batching the IOTLB flushes saves
2164                  * roughly 1/6th of the CPU time they would otherwise cost.
2165                  */
2166         }
2167 }
2168
2169 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2170                            dma_addr_t *dma_handle, gfp_t flags)
2171 {
2172         void *vaddr;
2173         int order;
2174
2175         size = PAGE_ALIGN(size);
2176         order = get_order(size);
2177         flags &= ~(GFP_DMA | GFP_DMA32);
2178
2179         vaddr = (void *)__get_free_pages(flags, order);
2180         if (!vaddr)
2181                 return NULL;
2182         memset(vaddr, 0, size);
2183
2184         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2185                                          DMA_BIDIRECTIONAL,
2186                                          hwdev->coherent_dma_mask);
2187         if (*dma_handle)
2188                 return vaddr;
2189         free_pages((unsigned long)vaddr, order);
2190         return NULL;
2191 }
2192
2193 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2194                          dma_addr_t dma_handle)
2195 {
2196         int order;
2197
2198         size = PAGE_ALIGN(size);
2199         order = get_order(size);
2200
2201         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2202         free_pages((unsigned long)vaddr, order);
2203 }
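
/*
 * Illustrative sketch, not part of this driver and not compiled: a
 * consistent allocation as a driver would issue it; it reaches
 * intel_alloc_coherent()/intel_free_coherent() via intel_dma_ops.  The
 * helper names and the 16 KiB ring size are hypothetical.
 */
#if 0
static void *example_alloc_ring(struct device *dev, dma_addr_t *ring_dma)
{
        /* zeroed, DMA-visible memory; *ring_dma receives the bus address */
        return dma_alloc_coherent(dev, 16 * 1024, ring_dma, GFP_KERNEL);
}

static void example_free_ring(struct device *dev, void *ring, dma_addr_t ring_dma)
{
        dma_free_coherent(dev, 16 * 1024, ring, ring_dma);
}
#endif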
2204
2205 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2206
2207 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2208                     int nelems, int dir)
2209 {
2210         int i;
2211         struct pci_dev *pdev = to_pci_dev(hwdev);
2212         struct dmar_domain *domain;
2213         unsigned long start_addr;
2214         struct iova *iova;
2215         size_t size = 0;
2216         void *addr;
2217         struct scatterlist *sg;
2218
2219         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2220                 return;
2221
2222         domain = find_domain(pdev);
2223
2224         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2225         if (!iova)
2226                 return;
2227         for_each_sg(sglist, sg, nelems, i) {
2228                 addr = SG_ENT_VIRT_ADDRESS(sg);
2229                 size += aligned_size((u64)addr, sg->length);
2230         }
2231
2232         start_addr = iova->pfn_lo << PAGE_SHIFT;
2233
2234         /* clear the whole mapped range */
2235         dma_pte_clear_range(domain, start_addr, start_addr + size);
2236         /* free page tables */
2237         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2238
2239         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2240                         size >> VTD_PAGE_SHIFT, 0))
2241                 iommu_flush_write_buffer(domain->iommu);
2242
2243         /* free iova */
2244         __free_iova(&domain->iovad, iova);
2245 }
2246
2247 static int intel_nontranslate_map_sg(struct device *hddev,
2248         struct scatterlist *sglist, int nelems, int dir)
2249 {
2250         int i;
2251         struct scatterlist *sg;
2252
2253         for_each_sg(sglist, sg, nelems, i) {
2254                 BUG_ON(!sg_page(sg));
2255                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2256                 sg->dma_length = sg->length;
2257         }
2258         return nelems;
2259 }
2260
2261 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2262                  int dir)
2263 {
2264         void *addr;
2265         int i;
2266         struct pci_dev *pdev = to_pci_dev(hwdev);
2267         struct dmar_domain *domain;
2268         size_t size = 0;
2269         int prot = 0;
2270         size_t offset = 0;
2271         struct iova *iova = NULL;
2272         int ret;
2273         struct scatterlist *sg;
2274         unsigned long start_addr;
2275
2276         BUG_ON(dir == DMA_NONE);
2277         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2278                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2279
2280         domain = get_valid_domain_for_dev(pdev);
2281         if (!domain)
2282                 return 0;
2283
2284         for_each_sg(sglist, sg, nelems, i) {
2285                 addr = SG_ENT_VIRT_ADDRESS(sg);
2286                 addr = (void *)virt_to_phys(addr);
2287                 size += aligned_size((u64)addr, sg->length);
2288         }
2289
2290         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2291         if (!iova) {
2292                 sglist->dma_length = 0;
2293                 return 0;
2294         }
2295
2296         /*
2297          * Check if the DMAR supports zero-length reads on write-only
2298          * mappings.
2299          */
2300         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2301                         !cap_zlr(domain->iommu->cap))
2302                 prot |= DMA_PTE_READ;
2303         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2304                 prot |= DMA_PTE_WRITE;
2305
2306         start_addr = iova->pfn_lo << PAGE_SHIFT;
2307         offset = 0;
2308         for_each_sg(sglist, sg, nelems, i) {
2309                 addr = SG_ENT_VIRT_ADDRESS(sg);
2310                 addr = (void *)virt_to_phys(addr);
2311                 size = aligned_size((u64)addr, sg->length);
2312                 ret = domain_page_mapping(domain, start_addr + offset,
2313                         ((u64)addr) & PAGE_MASK,
2314                         size, prot);
2315                 if (ret) {
2316                         /* clear what has been mapped so far */
2317                         dma_pte_clear_range(domain, start_addr,
2318                                   start_addr + offset);
2319                         /* free page tables */
2320                         dma_pte_free_pagetable(domain, start_addr,
2321                                   start_addr + offset);
2322                         /* free iova */
2323                         __free_iova(&domain->iovad, iova);
2324                         return 0;
2325                 }
2326                 sg->dma_address = start_addr + offset +
2327                                 ((u64)addr & (~PAGE_MASK));
2328                 sg->dma_length = sg->length;
2329                 offset += size;
2330         }
2331
2332         /* it's a non-present to present mapping */
2333         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2334                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2335                 iommu_flush_write_buffer(domain->iommu);
2336         return nelems;
2337 }
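
/*
 * Illustrative sketch, not part of this driver and not compiled: a
 * two-element scatterlist mapped through dma_map_sg(), which dispatches to
 * intel_map_sg() above.  The buffers and their sizes are hypothetical.
 */
#if 0
static int example_map_two_buffers(struct device *dev, void *a, void *b)
{
        struct scatterlist sg[2];
        int i, nents;

        sg_init_table(sg, 2);
        sg_set_buf(&sg[0], a, 512);
        sg_set_buf(&sg[1], b, 2048);

        nents = dma_map_sg(dev, sg, 2, DMA_FROM_DEVICE);
        if (!nents)
                return -EIO;

        for (i = 0; i < nents; i++)
                pr_debug("segment %d: 0x%llx+0x%x\n", i,
                         (unsigned long long)sg_dma_address(&sg[i]),
                         sg_dma_len(&sg[i]));

        /* unmap with the original nelems, per the DMA API */
        dma_unmap_sg(dev, sg, 2, DMA_FROM_DEVICE);
        return 0;
}
#endif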
2338
2339 static struct dma_mapping_ops intel_dma_ops = {
2340         .alloc_coherent = intel_alloc_coherent,
2341         .free_coherent = intel_free_coherent,
2342         .map_single = intel_map_single,
2343         .unmap_single = intel_unmap_single,
2344         .map_sg = intel_map_sg,
2345         .unmap_sg = intel_unmap_sg,
2346 };
2347
2348 static inline int iommu_domain_cache_init(void)
2349 {
2350         int ret = 0;
2351
2352         iommu_domain_cache = kmem_cache_create("iommu_domain",
2353                                          sizeof(struct dmar_domain),
2354                                          0,
2355                                          SLAB_HWCACHE_ALIGN,
2356                                          NULL);
2357
2358         if (!iommu_domain_cache) {
2359                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2360                 ret = -ENOMEM;
2361         }
2362
2363         return ret;
2364 }
2365
2366 static inline int iommu_devinfo_cache_init(void)
2367 {
2368         int ret = 0;
2369
2370         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2371                                          sizeof(struct device_domain_info),
2372                                          0,
2373                                          SLAB_HWCACHE_ALIGN,
2374                                          NULL);
2375         if (!iommu_devinfo_cache) {
2376                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2377                 ret = -ENOMEM;
2378         }
2379
2380         return ret;
2381 }
2382
2383 static inline int iommu_iova_cache_init(void)
2384 {
2385         int ret = 0;
2386
2387         iommu_iova_cache = kmem_cache_create("iommu_iova",
2388                                          sizeof(struct iova),
2389                                          0,
2390                                          SLAB_HWCACHE_ALIGN,
2391                                          NULL);
2392         if (!iommu_iova_cache) {
2393                 printk(KERN_ERR "Couldn't create iova cache\n");
2394                 ret = -ENOMEM;
2395         }
2396
2397         return ret;
2398 }
2399
2400 static int __init iommu_init_mempool(void)
2401 {
2402         int ret;
2403         ret = iommu_iova_cache_init();
2404         if (ret)
2405                 return ret;
2406
2407         ret = iommu_domain_cache_init();
2408         if (ret)
2409                 goto domain_error;
2410
2411         ret = iommu_devinfo_cache_init();
2412         if (!ret)
2413                 return ret;
2414
2415         kmem_cache_destroy(iommu_domain_cache);
2416 domain_error:
2417         kmem_cache_destroy(iommu_iova_cache);
2418
2419         return -ENOMEM;
2420 }
2421
2422 static void __init iommu_exit_mempool(void)
2423 {
2424         kmem_cache_destroy(iommu_devinfo_cache);
2425         kmem_cache_destroy(iommu_domain_cache);
2426         kmem_cache_destroy(iommu_iova_cache);
2427
2428 }
2429
2430 static void __init init_no_remapping_devices(void)
2431 {
2432         struct dmar_drhd_unit *drhd;
2433
2434         for_each_drhd_unit(drhd) {
2435                 if (!drhd->include_all) {
2436                         int i;
2437                         for (i = 0; i < drhd->devices_cnt; i++)
2438                                 if (drhd->devices[i] != NULL)
2439                                         break;
2440                         /* ignore the DMAR unit if no PCI devices are attached to it */
2441                         if (i == drhd->devices_cnt)
2442                                 drhd->ignored = 1;
2443                 }
2444         }
2445
2446         if (dmar_map_gfx)
2447                 return;
2448
2449         for_each_drhd_unit(drhd) {
2450                 int i;
2451                 if (drhd->ignored || drhd->include_all)
2452                         continue;
2453
2454                 for (i = 0; i < drhd->devices_cnt; i++)
2455                         if (drhd->devices[i] &&
2456                                 !IS_GFX_DEVICE(drhd->devices[i]))
2457                                 break;
2458
2459                 if (i < drhd->devices_cnt)
2460                         continue;
2461
2462                 /* bypass IOMMU if it is just for gfx devices */
2463                 drhd->ignored = 1;
2464                 for (i = 0; i < drhd->devices_cnt; i++) {
2465                         if (!drhd->devices[i])
2466                                 continue;
2467                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2468                 }
2469         }
2470 }
2471
2472 int __init intel_iommu_init(void)
2473 {
2474         int ret = 0;
2475
2476         if (dmar_table_init())
2477                 return  -ENODEV;
2478
2479         if (dmar_dev_scope_init())
2480                 return  -ENODEV;
2481
2482         /*
2483          * Check the need for DMA-remapping initialization now.
2484          * The initialization above is also used by interrupt remapping.
2485          */
2486         if (no_iommu || swiotlb || dmar_disabled)
2487                 return -ENODEV;
2488
2489         iommu_init_mempool();
2490         dmar_init_reserved_ranges();
2491
2492         init_no_remapping_devices();
2493
2494         ret = init_dmars();
2495         if (ret) {
2496                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2497                 put_iova_domain(&reserved_iova_list);
2498                 iommu_exit_mempool();
2499                 return ret;
2500         }
2501         printk(KERN_INFO
2502         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2503
2504         init_timer(&unmap_timer);
2505         force_iommu = 1;
2506         dma_ops = &intel_dma_ops;
2507         return 0;
2508 }
2509
2510 void intel_iommu_domain_exit(struct dmar_domain *domain)
2511 {
2512         u64 end;
2513
2514         /* Domain 0 is reserved, so don't process it */
2515         if (!domain)
2516                 return;
2517
2518         end = DOMAIN_MAX_ADDR(domain->gaw);
2519         end = end & (~VTD_PAGE_MASK);
2520
2521         /* clear ptes */
2522         dma_pte_clear_range(domain, 0, end);
2523
2524         /* free page tables */
2525         dma_pte_free_pagetable(domain, 0, end);
2526
2527         iommu_free_domain(domain);
2528         free_domain_mem(domain);
2529 }
2530 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2531
2532 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2533 {
2534         struct dmar_drhd_unit *drhd;
2535         struct dmar_domain *domain;
2536         struct intel_iommu *iommu;
2537
2538         drhd = dmar_find_matched_drhd_unit(pdev);
2539         if (!drhd) {
2540                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2541                 return NULL;
2542         }
2543
2544         iommu = drhd->iommu;
2545         if (!iommu) {
2546                 printk(KERN_ERR
2547                         "intel_iommu_domain_alloc: iommu == NULL\n");
2548                 return NULL;
2549         }
2550         domain = iommu_alloc_domain(iommu);
2551         if (!domain) {
2552                 printk(KERN_ERR
2553                         "intel_iommu_domain_alloc: domain == NULL\n");
2554                 return NULL;
2555         }
2556         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2557                 printk(KERN_ERR
2558                         "intel_iommu_domain_alloc: domain_init() failed\n");
2559                 intel_iommu_domain_exit(domain);
2560                 return NULL;
2561         }
2562         return domain;
2563 }
2564 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2565
2566 int intel_iommu_context_mapping(
2567         struct dmar_domain *domain, struct pci_dev *pdev)
2568 {
2569         int rc;
2570         rc = domain_context_mapping(domain, pdev);
2571         return rc;
2572 }
2573 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2574
2575 int intel_iommu_page_mapping(
2576         struct dmar_domain *domain, dma_addr_t iova,
2577         u64 hpa, size_t size, int prot)
2578 {
2579         int rc;
2580         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2581         return rc;
2582 }
2583 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2584
2585 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2586 {
2587         detach_domain_for_dev(domain, bus, devfn);
2588 }
2589 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2590
2591 struct dmar_domain *
2592 intel_iommu_find_domain(struct pci_dev *pdev)
2593 {
2594         return find_domain(pdev);
2595 }
2596 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2597
2598 int intel_iommu_found(void)
2599 {
2600         return g_num_of_iommus;
2601 }
2602 EXPORT_SYMBOL_GPL(intel_iommu_found);
2603
2604 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2605 {
2606         struct dma_pte *pte;
2607         u64 pfn;
2608
2609         pfn = 0;
2610         pte = addr_to_dma_pte(domain, iova);
2611
2612         if (pte)
2613                 pfn = dma_pte_addr(pte);
2614
2615         return pfn >> VTD_PAGE_SHIFT;
2616 }
2617 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);
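
/*
 * Illustrative sketch, not part of this driver and not compiled: how an
 * external user such as a device-assignment layer might drive the exported
 * helpers above.  The function name and the guest/host addresses are
 * hypothetical.
 */
#if 0
static int example_assign_device(struct pci_dev *pdev, u64 gpa, u64 hpa)
{
        struct dmar_domain *domain;
        int ret;

        domain = intel_iommu_domain_alloc(pdev);
        if (!domain)
                return -ENOMEM;

        /* route the device's DMA through this domain */
        ret = intel_iommu_context_mapping(domain, pdev);
        if (ret)
                goto out;

        /* map one page at gpa read/write onto the host page at hpa */
        ret = intel_iommu_page_mapping(domain, gpa, hpa, VTD_PAGE_SIZE,
                                       DMA_PTE_READ | DMA_PTE_WRITE);
        if (ret)
                goto out;

        /* translate it back: should print the pfn of hpa */
        pr_debug("gpa 0x%llx -> host pfn 0x%llx\n",
                 (unsigned long long)gpa,
                 (unsigned long long)intel_iommu_iova_to_pfn(domain, gpa));
        return 0;
out:
        intel_iommu_domain_exit(domain);
        return ret;
}
#endif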