2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
43 #define ROOT_SIZE VTD_PAGE_SIZE
44 #define CONTEXT_SIZE VTD_PAGE_SIZE
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
49 #define IOAPIC_RANGE_START (0xfee00000)
50 #define IOAPIC_RANGE_END (0xfeefffff)
51 #define IOVA_START_ADDR (0x1000)
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
57 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
67 * 12-63: Context Ptr (12 - (haw-1))
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75 static inline bool root_present(struct root_entry *root)
77 return (root->val & 1);
79 static inline void set_root_present(struct root_entry *root)
83 static inline void set_root_value(struct root_entry *root, unsigned long value)
85 root->val |= value & VTD_PAGE_MASK;
88 static inline struct context_entry *
89 get_context_addr_from_root(struct root_entry *root)
91 return (struct context_entry *)
92 (root_present(root)?phys_to_virt(
93 root->val & VTD_PAGE_MASK) :
100 * 1: fault processing disable
101 * 2-3: translation type
102 * 12-63: address space root
108 struct context_entry {
113 static inline bool context_present(struct context_entry *context)
115 return (context->lo & 1);
117 static inline void context_set_present(struct context_entry *context)
122 static inline void context_set_fault_enable(struct context_entry *context)
124 context->lo &= (((u64)-1) << 2) | 1;
127 #define CONTEXT_TT_MULTI_LEVEL 0
129 static inline void context_set_translation_type(struct context_entry *context,
132 context->lo &= (((u64)-1) << 4) | 3;
133 context->lo |= (value & 3) << 2;
136 static inline void context_set_address_root(struct context_entry *context,
139 context->lo |= value & VTD_PAGE_MASK;
142 static inline void context_set_address_width(struct context_entry *context,
145 context->hi |= value & 7;
148 static inline void context_set_domain_id(struct context_entry *context,
151 context->hi |= (value & ((1 << 16) - 1)) << 8;
154 static inline void context_clear_entry(struct context_entry *context)
166 * 12-63: Host physcial address
172 static inline void dma_clear_pte(struct dma_pte *pte)
177 static inline void dma_set_pte_readable(struct dma_pte *pte)
179 pte->val |= DMA_PTE_READ;
182 static inline void dma_set_pte_writable(struct dma_pte *pte)
184 pte->val |= DMA_PTE_WRITE;
187 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
189 pte->val = (pte->val & ~3) | (prot & 3);
192 static inline u64 dma_pte_addr(struct dma_pte *pte)
194 return (pte->val & VTD_PAGE_MASK);
197 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
199 pte->val |= (addr & VTD_PAGE_MASK);
202 static inline bool dma_pte_present(struct dma_pte *pte)
204 return (pte->val & 3) != 0;
207 /* devices under the same p2p bridge are owned in one domain */
208 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
210 /* domain represents a virtual machine, more than one devices
211 * across iommus may be owned in one domain, e.g. kvm guest.
213 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
216 int id; /* domain id */
217 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
219 struct list_head devices; /* all devices' list */
220 struct iova_domain iovad; /* iova's that belong to this domain */
222 struct dma_pte *pgd; /* virtual address */
223 spinlock_t mapping_lock; /* page table lock */
224 int gaw; /* max guest address width */
226 /* adjusted guest address width, 0 is level 2 30-bit */
229 int flags; /* flags to find out type of domain */
231 int iommu_coherency;/* indicate coherency of iommu access */
232 int iommu_count; /* reference count of iommu */
233 spinlock_t iommu_lock; /* protect iommu set in domain */
234 u64 max_addr; /* maximum mapped address */
237 /* PCI domain-device relationship */
238 struct device_domain_info {
239 struct list_head link; /* link to domain siblings */
240 struct list_head global; /* link to global list */
241 u8 bus; /* PCI bus numer */
242 u8 devfn; /* PCI devfn number */
243 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
244 struct dmar_domain *domain; /* pointer to domain */
247 static void flush_unmaps_timeout(unsigned long data);
249 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
/* Per-iommu table of IOVAs whose IOTLB flush is deferred (batched). */
#define HIGH_WATER_MARK 250
struct deferred_flush_tables {
	int next;				/* next free slot */
	struct iova *iova[HIGH_WATER_MARK];	/* iovas awaiting unmap flush */
	struct dmar_domain *domain[HIGH_WATER_MARK]; /* owning domains */
};

static struct deferred_flush_tables *deferred_flush;
260 /* bitmap for indexing intel_iommus */
261 static int g_num_of_iommus;
263 static DEFINE_SPINLOCK(async_umap_flush_lock);
264 static LIST_HEAD(unmaps_to_do);
267 static long list_size;
269 static void domain_remove_dev_info(struct dmar_domain *domain);
272 static int __initdata dmar_map_gfx = 1;
273 static int dmar_forcedac;
274 static int intel_iommu_strict;
276 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
277 static DEFINE_SPINLOCK(device_domain_lock);
278 static LIST_HEAD(device_domain_list);
280 static struct iommu_ops intel_iommu_ops;
/*
 * Parser for the "intel_iommu=" kernel boot option.  Recognized
 * comma-separated tokens: "off", "igfx_off", "forcedac", "strict".
 * NOTE(review): this excerpt is missing several interior lines of the
 * original function (the flag assignments for "off"/"igfx_off"/
 * "forcedac", the surrounding token loop and braces) — do not treat the
 * visible text as the complete implementation.
 */
282 static int __init intel_iommu_setup(char *str)
287 if (!strncmp(str, "off", 3)) {
289 printk(KERN_INFO"Intel-IOMMU: disabled\n")
290 } else if (!strncmp(str, "igfx_off", 8)) {
293 "Intel-IOMMU: disable GFX device mapping\n");
294 } else if (!strncmp(str, "forcedac", 8)) {
296 "Intel-IOMMU: Forcing DAC for PCI devices\n");
298 } else if (!strncmp(str, "strict", 6)) {
300 "Intel-IOMMU: disable batched IOTLB flush\n");
301 intel_iommu_strict = 1;
/* advance to the next comma-separated token */
304 str += strcspn(str, ",");
310 __setup("intel_iommu=", intel_iommu_setup);
312 static struct kmem_cache *iommu_domain_cache;
313 static struct kmem_cache *iommu_devinfo_cache;
314 static struct kmem_cache *iommu_iova_cache;
316 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
321 /* trying to avoid low memory issues */
322 flags = current->flags & PF_MEMALLOC;
323 current->flags |= PF_MEMALLOC;
324 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
325 current->flags &= (~PF_MEMALLOC | flags);
330 static inline void *alloc_pgtable_page(void)
335 /* trying to avoid low memory issues */
336 flags = current->flags & PF_MEMALLOC;
337 current->flags |= PF_MEMALLOC;
338 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
339 current->flags &= (~PF_MEMALLOC | flags);
343 static inline void free_pgtable_page(void *vaddr)
345 free_page((unsigned long)vaddr);
348 static inline void *alloc_domain_mem(void)
350 return iommu_kmem_cache_alloc(iommu_domain_cache);
353 static void free_domain_mem(void *vaddr)
355 kmem_cache_free(iommu_domain_cache, vaddr);
358 static inline void * alloc_devinfo_mem(void)
360 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
363 static inline void free_devinfo_mem(void *vaddr)
365 kmem_cache_free(iommu_devinfo_cache, vaddr);
368 struct iova *alloc_iova_mem(void)
370 return iommu_kmem_cache_alloc(iommu_iova_cache);
373 void free_iova_mem(struct iova *iova)
375 kmem_cache_free(iommu_iova_cache, iova);
379 static inline int width_to_agaw(int width);
381 /* calculate agaw for each iommu.
382 * "SAGAW" may be different across iommus, use a default agaw, and
383 * get a supported less agaw for iommus that don't support the default agaw.
385 int iommu_calculate_agaw(struct intel_iommu *iommu)
390 sagaw = cap_sagaw(iommu->cap);
391 for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
393 if (test_bit(agaw, &sagaw))
400 /* in native case, each domain is related to only one iommu */
401 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
405 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
407 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
408 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
411 return g_iommus[iommu_id];
414 /* "Coherency" capability may be different across iommus */
415 static void domain_update_iommu_coherency(struct dmar_domain *domain)
419 domain->iommu_coherency = 1;
421 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
422 for (; i < g_num_of_iommus; ) {
423 if (!ecap_coherent(g_iommus[i]->ecap)) {
424 domain->iommu_coherency = 0;
427 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
431 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
433 struct dmar_drhd_unit *drhd = NULL;
436 for_each_drhd_unit(drhd) {
440 for (i = 0; i < drhd->devices_cnt; i++)
441 if (drhd->devices[i] &&
442 drhd->devices[i]->bus->number == bus &&
443 drhd->devices[i]->devfn == devfn)
446 if (drhd->include_all)
453 static void domain_flush_cache(struct dmar_domain *domain,
454 void *addr, int size)
456 if (!domain->iommu_coherency)
457 clflush_cache_range(addr, size);
460 /* Gets context entry for a given bus and devfn */
461 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
464 struct root_entry *root;
465 struct context_entry *context;
466 unsigned long phy_addr;
469 spin_lock_irqsave(&iommu->lock, flags);
470 root = &iommu->root_entry[bus];
471 context = get_context_addr_from_root(root);
473 context = (struct context_entry *)alloc_pgtable_page();
475 spin_unlock_irqrestore(&iommu->lock, flags);
478 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
479 phy_addr = virt_to_phys((void *)context);
480 set_root_value(root, phy_addr);
481 set_root_present(root);
482 __iommu_flush_cache(iommu, root, sizeof(*root));
484 spin_unlock_irqrestore(&iommu->lock, flags);
485 return &context[devfn];
488 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
490 struct root_entry *root;
491 struct context_entry *context;
495 spin_lock_irqsave(&iommu->lock, flags);
496 root = &iommu->root_entry[bus];
497 context = get_context_addr_from_root(root);
502 ret = context_present(&context[devfn]);
504 spin_unlock_irqrestore(&iommu->lock, flags);
508 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
510 struct root_entry *root;
511 struct context_entry *context;
514 spin_lock_irqsave(&iommu->lock, flags);
515 root = &iommu->root_entry[bus];
516 context = get_context_addr_from_root(root);
518 context_clear_entry(&context[devfn]);
519 __iommu_flush_cache(iommu, &context[devfn], \
522 spin_unlock_irqrestore(&iommu->lock, flags);
525 static void free_context_table(struct intel_iommu *iommu)
527 struct root_entry *root;
530 struct context_entry *context;
532 spin_lock_irqsave(&iommu->lock, flags);
533 if (!iommu->root_entry) {
536 for (i = 0; i < ROOT_ENTRY_NR; i++) {
537 root = &iommu->root_entry[i];
538 context = get_context_addr_from_root(root);
540 free_pgtable_page(context);
542 free_pgtable_page(iommu->root_entry);
543 iommu->root_entry = NULL;
545 spin_unlock_irqrestore(&iommu->lock, flags);
548 /* page table handling */
549 #define LEVEL_STRIDE (9)
550 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
552 static inline int agaw_to_level(int agaw)
557 static inline int agaw_to_width(int agaw)
559 return 30 + agaw * LEVEL_STRIDE;
563 static inline int width_to_agaw(int width)
565 return (width - 30) / LEVEL_STRIDE;
568 static inline unsigned int level_to_offset_bits(int level)
570 return (12 + (level - 1) * LEVEL_STRIDE);
573 static inline int address_level_offset(u64 addr, int level)
575 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
578 static inline u64 level_mask(int level)
580 return ((u64)-1 << level_to_offset_bits(level));
583 static inline u64 level_size(int level)
585 return ((u64)1 << level_to_offset_bits(level));
588 static inline u64 align_to_level(u64 addr, int level)
590 return ((addr + level_size(level) - 1) & level_mask(level));
593 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
595 int addr_width = agaw_to_width(domain->agaw);
596 struct dma_pte *parent, *pte = NULL;
597 int level = agaw_to_level(domain->agaw);
601 BUG_ON(!domain->pgd);
603 addr &= (((u64)1) << addr_width) - 1;
604 parent = domain->pgd;
606 spin_lock_irqsave(&domain->mapping_lock, flags);
610 offset = address_level_offset(addr, level);
611 pte = &parent[offset];
615 if (!dma_pte_present(pte)) {
616 tmp_page = alloc_pgtable_page();
619 spin_unlock_irqrestore(&domain->mapping_lock,
623 domain_flush_cache(domain, tmp_page, PAGE_SIZE);
624 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
626 * high level table always sets r/w, last level page
627 * table control read/write
629 dma_set_pte_readable(pte);
630 dma_set_pte_writable(pte);
631 domain_flush_cache(domain, pte, sizeof(*pte));
633 parent = phys_to_virt(dma_pte_addr(pte));
637 spin_unlock_irqrestore(&domain->mapping_lock, flags);
641 /* return address's pte at specific level */
642 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
645 struct dma_pte *parent, *pte = NULL;
646 int total = agaw_to_level(domain->agaw);
649 parent = domain->pgd;
650 while (level <= total) {
651 offset = address_level_offset(addr, total);
652 pte = &parent[offset];
656 if (!dma_pte_present(pte))
658 parent = phys_to_virt(dma_pte_addr(pte));
664 /* clear one page's page table */
665 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
667 struct dma_pte *pte = NULL;
669 /* get last level pte */
670 pte = dma_addr_level_pte(domain, addr, 1);
674 domain_flush_cache(domain, pte, sizeof(*pte));
678 /* clear last level pte, a tlb flush should be followed */
679 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
681 int addr_width = agaw_to_width(domain->agaw);
683 start &= (((u64)1) << addr_width) - 1;
684 end &= (((u64)1) << addr_width) - 1;
685 /* in case it's partial page */
686 start = PAGE_ALIGN(start);
689 /* we don't need lock here, nobody else touches the iova range */
690 while (start < end) {
691 dma_pte_clear_one(domain, start);
692 start += VTD_PAGE_SIZE;
696 /* free page table pages. last level pte should already be cleared */
697 static void dma_pte_free_pagetable(struct dmar_domain *domain,
700 int addr_width = agaw_to_width(domain->agaw);
702 int total = agaw_to_level(domain->agaw);
706 start &= (((u64)1) << addr_width) - 1;
707 end &= (((u64)1) << addr_width) - 1;
709 /* we don't need lock here, nobody else touches the iova range */
711 while (level <= total) {
712 tmp = align_to_level(start, level);
713 if (tmp >= end || (tmp + level_size(level) > end))
717 pte = dma_addr_level_pte(domain, tmp, level);
720 phys_to_virt(dma_pte_addr(pte)));
722 domain_flush_cache(domain, pte, sizeof(*pte));
724 tmp += level_size(level);
729 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
730 free_pgtable_page(domain->pgd);
736 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
738 struct root_entry *root;
741 root = (struct root_entry *)alloc_pgtable_page();
745 __iommu_flush_cache(iommu, root, ROOT_SIZE);
747 spin_lock_irqsave(&iommu->lock, flags);
748 iommu->root_entry = root;
749 spin_unlock_irqrestore(&iommu->lock, flags);
754 static void iommu_set_root_entry(struct intel_iommu *iommu)
760 addr = iommu->root_entry;
762 spin_lock_irqsave(&iommu->register_lock, flag);
763 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
765 cmd = iommu->gcmd | DMA_GCMD_SRTP;
766 writel(cmd, iommu->reg + DMAR_GCMD_REG);
768 /* Make sure hardware complete it */
769 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
770 readl, (sts & DMA_GSTS_RTPS), sts);
772 spin_unlock_irqrestore(&iommu->register_lock, flag);
775 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
780 if (!cap_rwbf(iommu->cap))
782 val = iommu->gcmd | DMA_GCMD_WBF;
784 spin_lock_irqsave(&iommu->register_lock, flag);
785 writel(val, iommu->reg + DMAR_GCMD_REG);
787 /* Make sure hardware complete it */
788 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
789 readl, (!(val & DMA_GSTS_WBFS)), val);
791 spin_unlock_irqrestore(&iommu->register_lock, flag);
794 /* return value determine if we need a write buffer flush */
795 static int __iommu_flush_context(struct intel_iommu *iommu,
796 u16 did, u16 source_id, u8 function_mask, u64 type,
797 int non_present_entry_flush)
803 * In the non-present entry flush case, if hardware doesn't cache
804 * non-present entry we do nothing and if hardware cache non-present
805 * entry, we flush entries of domain 0 (the domain id is used to cache
806 * any non-present entries)
808 if (non_present_entry_flush) {
809 if (!cap_caching_mode(iommu->cap))
816 case DMA_CCMD_GLOBAL_INVL:
817 val = DMA_CCMD_GLOBAL_INVL;
819 case DMA_CCMD_DOMAIN_INVL:
820 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
822 case DMA_CCMD_DEVICE_INVL:
823 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
824 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
831 spin_lock_irqsave(&iommu->register_lock, flag);
832 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
834 /* Make sure hardware complete it */
835 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
836 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
838 spin_unlock_irqrestore(&iommu->register_lock, flag);
840 /* flush context entry will implicitly flush write buffer */
844 /* return value determine if we need a write buffer flush */
845 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
846 u64 addr, unsigned int size_order, u64 type,
847 int non_present_entry_flush)
849 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
850 u64 val = 0, val_iva = 0;
854 * In the non-present entry flush case, if hardware doesn't cache
855 * non-present entry we do nothing and if hardware cache non-present
856 * entry, we flush entries of domain 0 (the domain id is used to cache
857 * any non-present entries)
859 if (non_present_entry_flush) {
860 if (!cap_caching_mode(iommu->cap))
867 case DMA_TLB_GLOBAL_FLUSH:
868 /* global flush doesn't need set IVA_REG */
869 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
871 case DMA_TLB_DSI_FLUSH:
872 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
874 case DMA_TLB_PSI_FLUSH:
875 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
876 /* Note: always flush non-leaf currently */
877 val_iva = size_order | addr;
882 /* Note: set drain read/write */
885 * This is probably to be super secure.. Looks like we can
886 * ignore it without any impact.
888 if (cap_read_drain(iommu->cap))
889 val |= DMA_TLB_READ_DRAIN;
891 if (cap_write_drain(iommu->cap))
892 val |= DMA_TLB_WRITE_DRAIN;
894 spin_lock_irqsave(&iommu->register_lock, flag);
895 /* Note: Only uses first TLB reg currently */
897 dmar_writeq(iommu->reg + tlb_offset, val_iva);
898 dmar_writeq(iommu->reg + tlb_offset + 8, val);
900 /* Make sure hardware complete it */
901 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
902 dmar_readq, (!(val & DMA_TLB_IVT)), val);
904 spin_unlock_irqrestore(&iommu->register_lock, flag);
906 /* check IOTLB invalidation granularity */
907 if (DMA_TLB_IAIG(val) == 0)
908 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
909 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
910 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
911 (unsigned long long)DMA_TLB_IIRG(type),
912 (unsigned long long)DMA_TLB_IAIG(val));
913 /* flush iotlb entry will implicitly flush write buffer */
917 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
918 u64 addr, unsigned int pages, int non_present_entry_flush)
922 BUG_ON(addr & (~VTD_PAGE_MASK));
925 /* Fallback to domain selective flush if no PSI support */
926 if (!cap_pgsel_inv(iommu->cap))
927 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
929 non_present_entry_flush);
932 * PSI requires page size to be 2 ^ x, and the base address is naturally
933 * aligned to the size
935 mask = ilog2(__roundup_pow_of_two(pages));
936 /* Fallback to domain selective flush if size is too big */
937 if (mask > cap_max_amask_val(iommu->cap))
938 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
939 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
941 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
943 non_present_entry_flush);
946 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
951 spin_lock_irqsave(&iommu->register_lock, flags);
952 pmen = readl(iommu->reg + DMAR_PMEN_REG);
953 pmen &= ~DMA_PMEN_EPM;
954 writel(pmen, iommu->reg + DMAR_PMEN_REG);
956 /* wait for the protected region status bit to clear */
957 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
958 readl, !(pmen & DMA_PMEN_PRS), pmen);
960 spin_unlock_irqrestore(&iommu->register_lock, flags);
963 static int iommu_enable_translation(struct intel_iommu *iommu)
968 spin_lock_irqsave(&iommu->register_lock, flags);
969 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
971 /* Make sure hardware complete it */
972 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
973 readl, (sts & DMA_GSTS_TES), sts);
975 iommu->gcmd |= DMA_GCMD_TE;
976 spin_unlock_irqrestore(&iommu->register_lock, flags);
980 static int iommu_disable_translation(struct intel_iommu *iommu)
985 spin_lock_irqsave(&iommu->register_lock, flag);
986 iommu->gcmd &= ~DMA_GCMD_TE;
987 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
989 /* Make sure hardware complete it */
990 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
991 readl, (!(sts & DMA_GSTS_TES)), sts);
993 spin_unlock_irqrestore(&iommu->register_lock, flag);
997 /* iommu interrupt handling. Most stuff are MSI-like. */
999 static const char *fault_reason_strings[] =
1002 "Present bit in root entry is clear",
1003 "Present bit in context entry is clear",
1004 "Invalid context entry",
1005 "Access beyond MGAW",
1006 "PTE Write access is not set",
1007 "PTE Read access is not set",
1008 "Next page table ptr is invalid",
1009 "Root table address invalid",
1010 "Context table ptr is invalid",
1011 "non-zero reserved fields in RTP",
1012 "non-zero reserved fields in CTP",
1013 "non-zero reserved fields in PTE",
1015 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1017 const char *dmar_get_fault_reason(u8 fault_reason)
1019 if (fault_reason > MAX_FAULT_REASON_IDX)
1022 return fault_reason_strings[fault_reason];
1025 void dmar_msi_unmask(unsigned int irq)
1027 struct intel_iommu *iommu = get_irq_data(irq);
1031 spin_lock_irqsave(&iommu->register_lock, flag);
1032 writel(0, iommu->reg + DMAR_FECTL_REG);
1033 /* Read a reg to force flush the post write */
1034 readl(iommu->reg + DMAR_FECTL_REG);
1035 spin_unlock_irqrestore(&iommu->register_lock, flag);
1038 void dmar_msi_mask(unsigned int irq)
1041 struct intel_iommu *iommu = get_irq_data(irq);
1044 spin_lock_irqsave(&iommu->register_lock, flag);
1045 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1046 /* Read a reg to force flush the post write */
1047 readl(iommu->reg + DMAR_FECTL_REG);
1048 spin_unlock_irqrestore(&iommu->register_lock, flag);
1051 void dmar_msi_write(int irq, struct msi_msg *msg)
1053 struct intel_iommu *iommu = get_irq_data(irq);
1056 spin_lock_irqsave(&iommu->register_lock, flag);
1057 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1058 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1059 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1060 spin_unlock_irqrestore(&iommu->register_lock, flag);
1063 void dmar_msi_read(int irq, struct msi_msg *msg)
1065 struct intel_iommu *iommu = get_irq_data(irq);
1068 spin_lock_irqsave(&iommu->register_lock, flag);
1069 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1070 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1071 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1072 spin_unlock_irqrestore(&iommu->register_lock, flag);
1075 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1076 u8 fault_reason, u16 source_id, unsigned long long addr)
1080 reason = dmar_get_fault_reason(fault_reason);
1083 "DMAR:[%s] Request device [%02x:%02x.%d] "
1084 "fault addr %llx \n"
1085 "DMAR:[fault reason %02d] %s\n",
1086 (type ? "DMA Read" : "DMA Write"),
1087 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1088 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1092 #define PRIMARY_FAULT_REG_LEN (16)
1093 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1095 struct intel_iommu *iommu = dev_id;
1096 int reg, fault_index;
1100 spin_lock_irqsave(&iommu->register_lock, flag);
1101 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1103 /* TBD: ignore advanced fault log currently */
1104 if (!(fault_status & DMA_FSTS_PPF))
1105 goto clear_overflow;
1107 fault_index = dma_fsts_fault_record_index(fault_status);
1108 reg = cap_fault_reg_offset(iommu->cap);
1116 /* highest 32 bits */
1117 data = readl(iommu->reg + reg +
1118 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1119 if (!(data & DMA_FRCD_F))
1122 fault_reason = dma_frcd_fault_reason(data);
1123 type = dma_frcd_type(data);
1125 data = readl(iommu->reg + reg +
1126 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1127 source_id = dma_frcd_source_id(data);
1129 guest_addr = dmar_readq(iommu->reg + reg +
1130 fault_index * PRIMARY_FAULT_REG_LEN);
1131 guest_addr = dma_frcd_page_addr(guest_addr);
1132 /* clear the fault */
1133 writel(DMA_FRCD_F, iommu->reg + reg +
1134 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1136 spin_unlock_irqrestore(&iommu->register_lock, flag);
1138 iommu_page_fault_do_one(iommu, type, fault_reason,
1139 source_id, guest_addr);
1142 if (fault_index > cap_num_fault_regs(iommu->cap))
1144 spin_lock_irqsave(&iommu->register_lock, flag);
1147 /* clear primary fault overflow */
1148 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1149 if (fault_status & DMA_FSTS_PFO)
1150 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1152 spin_unlock_irqrestore(&iommu->register_lock, flag);
1156 int dmar_set_interrupt(struct intel_iommu *iommu)
1162 printk(KERN_ERR "IOMMU: no free vectors\n");
1166 set_irq_data(irq, iommu);
1169 ret = arch_setup_dmar_msi(irq);
1171 set_irq_data(irq, NULL);
1177 /* Force fault register is cleared */
1178 iommu_page_fault(irq, iommu);
1180 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1182 printk(KERN_ERR "IOMMU: can't request irq\n");
1186 static int iommu_init_domains(struct intel_iommu *iommu)
1188 unsigned long ndomains;
1189 unsigned long nlongs;
1191 ndomains = cap_ndoms(iommu->cap);
1192 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1193 nlongs = BITS_TO_LONGS(ndomains);
1195 /* TBD: there might be 64K domains,
1196 * consider other allocation for future chip
1198 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1199 if (!iommu->domain_ids) {
1200 printk(KERN_ERR "Allocating domain id array failed\n");
1203 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1205 if (!iommu->domains) {
1206 printk(KERN_ERR "Allocating domain array failed\n");
1207 kfree(iommu->domain_ids);
1211 spin_lock_init(&iommu->lock);
1214 * if Caching mode is set, then invalid translations are tagged
1215 * with domainid 0. Hence we need to pre-allocate it.
1217 if (cap_caching_mode(iommu->cap))
1218 set_bit(0, iommu->domain_ids);
1223 static void domain_exit(struct dmar_domain *domain);
1224 static void vm_domain_exit(struct dmar_domain *domain);
1226 void free_dmar_iommu(struct intel_iommu *iommu)
1228 struct dmar_domain *domain;
1230 unsigned long flags;
1232 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1233 for (; i < cap_ndoms(iommu->cap); ) {
1234 domain = iommu->domains[i];
1235 clear_bit(i, iommu->domain_ids);
1237 spin_lock_irqsave(&domain->iommu_lock, flags);
1238 if (--domain->iommu_count == 0) {
1239 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1240 vm_domain_exit(domain);
1242 domain_exit(domain);
1244 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1246 i = find_next_bit(iommu->domain_ids,
1247 cap_ndoms(iommu->cap), i+1);
1250 if (iommu->gcmd & DMA_GCMD_TE)
1251 iommu_disable_translation(iommu);
1254 set_irq_data(iommu->irq, NULL);
1255 /* This will mask the irq */
1256 free_irq(iommu->irq, iommu);
1257 destroy_irq(iommu->irq);
1260 kfree(iommu->domains);
1261 kfree(iommu->domain_ids);
1263 g_iommus[iommu->seq_id] = NULL;
1265 /* if all iommus are freed, free g_iommus */
1266 for (i = 0; i < g_num_of_iommus; i++) {
1271 if (i == g_num_of_iommus)
1274 /* free context mapping */
1275 free_context_table(iommu);
1278 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1281 unsigned long ndomains;
1282 struct dmar_domain *domain;
1283 unsigned long flags;
1285 domain = alloc_domain_mem();
1289 ndomains = cap_ndoms(iommu->cap);
1291 spin_lock_irqsave(&iommu->lock, flags);
1292 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1293 if (num >= ndomains) {
1294 spin_unlock_irqrestore(&iommu->lock, flags);
1295 free_domain_mem(domain);
1296 printk(KERN_ERR "IOMMU: no free domain ids\n");
1300 set_bit(num, iommu->domain_ids);
1302 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1303 set_bit(iommu->seq_id, &domain->iommu_bmp);
1305 iommu->domains[num] = domain;
1306 spin_unlock_irqrestore(&iommu->lock, flags);
1311 static void iommu_free_domain(struct dmar_domain *domain)
1313 unsigned long flags;
1314 struct intel_iommu *iommu;
1316 iommu = domain_get_iommu(domain);
1318 spin_lock_irqsave(&iommu->lock, flags);
1319 clear_bit(domain->id, iommu->domain_ids);
1320 spin_unlock_irqrestore(&iommu->lock, flags);
1323 static struct iova_domain reserved_iova_list;
1324 static struct lock_class_key reserved_alloc_key;
1325 static struct lock_class_key reserved_rbtree_key;
1327 static void dmar_init_reserved_ranges(void)
1329 struct pci_dev *pdev = NULL;
1334 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1336 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1337 &reserved_alloc_key);
1338 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1339 &reserved_rbtree_key);
1341 /* IOAPIC ranges shouldn't be accessed by DMA */
1342 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1343 IOVA_PFN(IOAPIC_RANGE_END));
1345 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1347 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1348 for_each_pci_dev(pdev) {
1351 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1352 r = &pdev->resource[i];
1353 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1357 size = r->end - addr;
1358 size = PAGE_ALIGN(size);
1359 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1360 IOVA_PFN(size + addr) - 1);
1362 printk(KERN_ERR "Reserve iova failed\n");
1368 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1370 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/* Round a guest address width up to the next width the page-table
 * geometry supports (12 + multiple of 9 bits), capped at 64. */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
/*
 * domain_init() - initialise a freshly allocated dmar_domain: iova
 * allocator, locks, reserved ranges, address width (agaw) negotiation
 * with the backing IOMMU, and the top-level page directory.
 * @guest_width is clamped to the IOMMU's maximum (cap_mgaw).
 * NOTE(review): extract is elided (error returns, closing braces and
 * the final return are missing).
 */
1387 static int domain_init(struct dmar_domain *domain, int guest_width)
1389 struct intel_iommu *iommu;
1390 int adjust_width, agaw;
1391 unsigned long sagaw;
1393 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1394 spin_lock_init(&domain->mapping_lock);
1395 spin_lock_init(&domain->iommu_lock);
1397 domain_reserve_special_ranges(domain);
1399 /* calculate AGAW */
1400 iommu = domain_get_iommu(domain);
1401 if (guest_width > cap_mgaw(iommu->cap))
1402 guest_width = cap_mgaw(iommu->cap);
1403 domain->gaw = guest_width;
1404 adjust_width = guestwidth_to_adjustwidth(guest_width);
1405 agaw = width_to_agaw(adjust_width);
1406 sagaw = cap_sagaw(iommu->cap);
/* sagaw is the bitmask of agaw values the hardware supports */
1407 if (!test_bit(agaw, &sagaw)) {
1408 /* hardware doesn't support it, choose a bigger one */
1409 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1410 agaw = find_next_bit(&sagaw, 5, agaw);
1414 domain->agaw = agaw;
1415 INIT_LIST_HEAD(&domain->devices);
/* page-walk cache coherency comes from the extended capabilities */
1417 if (ecap_coherent(iommu->ecap))
1418 domain->iommu_coherency = 1;
1420 domain->iommu_coherency = 0;
1422 domain->iommu_count = 1;
1424 /* always allocate the top pgd */
1425 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1428 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
/*
 * domain_exit() - tear down a domain: detach its devices, release the
 * iova space, clear and free the page tables, then free the domain id
 * and the domain memory itself.
 */
1432 static void domain_exit(struct dmar_domain *domain)
1436 /* Domain 0 is reserved, so dont process it */
1440 domain_remove_dev_info(domain);
1442 put_iova_domain(&domain->iovad);
1443 end = DOMAIN_MAX_ADDR(domain->gaw);
/* drop the sub-page remainder of the max address */
1444 end = end & (~PAGE_MASK);
1447 dma_pte_clear_range(domain, 0, end);
1449 /* free page tables */
1450 dma_pte_free_pagetable(domain, 0, end);
1452 iommu_free_domain(domain);
1453 free_domain_mem(domain);
/*
 * domain_context_mapping_one() - install a context-table entry routing
 * DMA from (bus, devfn) into this domain's page tables, then flush the
 * context cache and IOTLB.  For virtual-machine domains the per-IOMMU
 * domain id is looked up or allocated here, since VM domain ids are
 * not bound to a single IOMMU.
 * NOTE(review): extract is elided (declarations of id/num/agaw, some
 * error returns and closing braces are missing).
 */
1456 static int domain_context_mapping_one(struct dmar_domain *domain,
1459 struct context_entry *context;
1460 unsigned long flags;
1461 struct intel_iommu *iommu;
1462 struct dma_pte *pgd;
1464 unsigned long ndomains;
1468 pr_debug("Set context mapping for %02x:%02x.%d\n",
1469 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1470 BUG_ON(!domain->pgd);
1472 iommu = device_to_iommu(bus, devfn);
1476 context = device_to_context_entry(iommu, bus, devfn);
1479 spin_lock_irqsave(&iommu->lock, flags);
/* someone already mapped this device: nothing to do */
1480 if (context_present(context)) {
1481 spin_unlock_irqrestore(&iommu->lock, flags);
1488 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1491 /* find an available domain id for this device in iommu */
1492 ndomains = cap_ndoms(iommu->cap);
1493 num = find_first_bit(iommu->domain_ids, ndomains);
1494 for (; num < ndomains; ) {
1495 if (iommu->domains[num] == domain) {
1500 num = find_next_bit(iommu->domain_ids,
1501 cap_ndoms(iommu->cap), num+1);
/* no existing id for this domain on this iommu: allocate one */
1505 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1506 if (num >= ndomains) {
1507 spin_unlock_irqrestore(&iommu->lock, flags);
1508 printk(KERN_ERR "IOMMU: no free domain ids\n");
1512 set_bit(num, iommu->domain_ids);
1513 iommu->domains[num] = domain;
1517 /* Skip top levels of page tables for
1518 * iommu which has less agaw than default.
1520 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1521 pgd = phys_to_virt(dma_pte_addr(pgd));
1522 if (!dma_pte_present(pgd)) {
1523 spin_unlock_irqrestore(&iommu->lock, flags);
/* fill in the context entry and make it visible to hardware */
1529 context_set_domain_id(context, id);
1530 context_set_address_width(context, iommu->agaw);
1531 context_set_address_root(context, virt_to_phys(pgd));
1532 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1533 context_set_fault_enable(context);
1534 context_set_present(context);
1535 domain_flush_cache(domain, context, sizeof(*context));
1537 /* it's a non-present to present mapping */
1538 if (iommu->flush.flush_context(iommu, domain->id,
1539 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1540 DMA_CCMD_DEVICE_INVL, 1))
1541 iommu_flush_write_buffer(iommu);
1543 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1545 spin_unlock_irqrestore(&iommu->lock, flags);
/* account this IOMMU in the domain, under the domain's own lock */
1547 spin_lock_irqsave(&domain->iommu_lock, flags);
1548 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1549 domain->iommu_count++;
1550 domain_update_iommu_coherency(domain);
1552 spin_unlock_irqrestore(&domain->iommu_lock, flags);
/*
 * domain_context_mapping() - context-map the device itself and, when it
 * sits behind a PCIe-to-PCI bridge, every bridge on the upstream path
 * (legacy PCI transactions carry the bridge's IDs, not the device's).
 * NOTE(review): return type and some lines are elided in this extract.
 */
1557 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1560 struct pci_dev *tmp, *parent;
1562 ret = domain_context_mapping_one(domain, pdev->bus->number,
1567 /* dependent device mapping */
1568 tmp = pci_find_upstream_pcie_bridge(pdev);
1571 /* Secondary interface's bus number and devfn 0 */
1572 parent = pdev->bus->self;
/* walk every intermediate bridge up to the PCIe bridge */
1573 while (parent != tmp) {
1574 ret = domain_context_mapping_one(domain, parent->bus->number,
1578 parent = parent->bus->self;
1580 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1581 return domain_context_mapping_one(domain,
1582 tmp->subordinate->number, 0);
1583 else /* this is a legacy PCI bridge */
1584 return domain_context_mapping_one(domain,
1585 tmp->bus->number, tmp->devfn);
/*
 * domain_context_mapped() - check whether the device (and, if behind a
 * PCIe-to-PCI bridge, the whole upstream bridge path) already has
 * context entries installed; mirrors domain_context_mapping().
 */
1588 static int domain_context_mapped(struct pci_dev *pdev)
1591 struct pci_dev *tmp, *parent;
1592 struct intel_iommu *iommu;
1594 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1598 ret = device_context_mapped(iommu,
1599 pdev->bus->number, pdev->devfn);
1602 /* dependent device mapping */
1603 tmp = pci_find_upstream_pcie_bridge(pdev);
1606 /* Secondary interface's bus number and devfn 0 */
1607 parent = pdev->bus->self;
1608 while (parent != tmp) {
1609 ret = device_context_mapped(iommu, parent->bus->number,
1613 parent = parent->bus->self;
/* PCIe-to-PCI bridge uses (subordinate bus, devfn 0) */
1616 return device_context_mapped(iommu,
1617 tmp->subordinate->number, 0);
1619 return device_context_mapped(iommu,
1620 tmp->bus->number, tmp->devfn);
/*
 * domain_page_mapping() - map [iova, iova+size) to host physical range
 * starting at hpa, one VTD page at a time.  prot must include at least
 * one of DMA_PTE_READ/DMA_PTE_WRITE.  No PTE locking: the caller owns
 * the iova range exclusively.
 * NOTE(review): extract is elided ('index' declaration/increment and
 * loop braces are missing).
 */
1624 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1625 u64 hpa, size_t size, int prot)
1627 u64 start_pfn, end_pfn;
1628 struct dma_pte *pte;
1630 int addr_width = agaw_to_width(domain->agaw);
/* clip the host address to the domain's addressable width */
1632 hpa &= (((u64)1) << addr_width) - 1;
1634 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1637 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1638 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1640 while (start_pfn < end_pfn) {
1641 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1644 /* We don't need lock here, nobody else
1645 * touches the iova range
1647 BUG_ON(dma_pte_addr(pte));
1648 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1649 dma_set_pte_prot(pte, prot);
1650 domain_flush_cache(domain, pte, sizeof(*pte));
/* Clear the context entry for (bus, devfn), then globally invalidate
 * the context cache and IOTLB so stale translations are dropped. */
1657 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1662 clear_context_table(iommu, bus, devfn);
1663 iommu->flush.flush_context(iommu, 0, 0, 0,
1664 DMA_CCMD_GLOBAL_INVL, 0);
1665 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1666 DMA_TLB_GLOBAL_FLUSH, 0);
/*
 * domain_remove_dev_info() - unlink every device_domain_info attached
 * to the domain, detach each device from its IOMMU and free the info.
 * The list lock is dropped around the hardware detach (which may
 * sleep/flush), so the loop restarts from the list head each time.
 */
1669 static void domain_remove_dev_info(struct dmar_domain *domain)
1671 struct device_domain_info *info;
1672 unsigned long flags;
1673 struct intel_iommu *iommu;
1675 spin_lock_irqsave(&device_domain_lock, flags);
1676 while (!list_empty(&domain->devices)) {
1677 info = list_entry(domain->devices.next,
1678 struct device_domain_info, link);
1679 list_del(&info->link);
1680 list_del(&info->global);
/* detach the archdata back-pointer before freeing the info */
1682 info->dev->dev.archdata.iommu = NULL;
1683 spin_unlock_irqrestore(&device_domain_lock, flags);
1685 iommu = device_to_iommu(info->bus, info->devfn);
1686 iommu_detach_dev(iommu, info->bus, info->devfn);
1687 free_devinfo_mem(info);
1689 spin_lock_irqsave(&device_domain_lock, flags);
1691 spin_unlock_irqrestore(&device_domain_lock, flags);
1696 * Note: struct pci_dev->dev.archdata.iommu stores the per-device domain info
/* Return the domain previously attached to this device via
 * pdev->dev.archdata.iommu, or fall through (elided) when none. */
1698 static struct dmar_domain *
1699 find_domain(struct pci_dev *pdev)
1701 struct device_domain_info *info;
1703 /* No lock here, assumes no domain exit in normal case */
1704 info = pdev->dev.archdata.iommu;
1706 return info->domain;
1710 /* domain is initialized */
/*
 * get_domain_for_dev() - find or create the domain for a device.
 * Devices behind the same PCIe-to-PCI bridge share one domain (keyed
 * by the bridge's bus/devfn); otherwise a fresh domain is allocated
 * from the matching DRHD's IOMMU and initialised to @gaw bits.
 * Racing allocators are resolved under device_domain_lock: the loser
 * frees its domain and uses the winner's.
 * NOTE(review): extract is elided (several returns, gotos and braces
 * are missing).
 */
1711 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1713 struct dmar_domain *domain, *found = NULL;
1714 struct intel_iommu *iommu;
1715 struct dmar_drhd_unit *drhd;
1716 struct device_domain_info *info, *tmp;
1717 struct pci_dev *dev_tmp;
1718 unsigned long flags;
1719 int bus = 0, devfn = 0;
1721 domain = find_domain(pdev);
1725 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
/* key the shared domain on the bridge's identity */
1727 if (dev_tmp->is_pcie) {
1728 bus = dev_tmp->subordinate->number;
1731 bus = dev_tmp->bus->number;
1732 devfn = dev_tmp->devfn;
1734 spin_lock_irqsave(&device_domain_lock, flags);
1735 list_for_each_entry(info, &device_domain_list, global) {
1736 if (info->bus == bus && info->devfn == devfn) {
1737 found = info->domain;
1741 spin_unlock_irqrestore(&device_domain_lock, flags);
1742 /* pcie-pci bridge already has a domain, uses it */
1749 /* Allocate new domain for the device */
1750 drhd = dmar_find_matched_drhd_unit(pdev);
1752 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1756 iommu = drhd->iommu;
1758 domain = iommu_alloc_domain(iommu);
1762 if (domain_init(domain, gaw)) {
1763 domain_exit(domain);
1767 /* register pcie-to-pci device */
1769 info = alloc_devinfo_mem();
1771 domain_exit(domain);
1775 info->devfn = devfn;
1777 info->domain = domain;
1778 /* This domain is shared by devices under p2p bridge */
1779 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1781 /* pcie-to-pci bridge already has a domain, uses it */
1783 spin_lock_irqsave(&device_domain_lock, flags);
/* re-check under the lock: someone may have registered the bridge */
1784 list_for_each_entry(tmp, &device_domain_list, global) {
1785 if (tmp->bus == bus && tmp->devfn == devfn) {
1786 found = tmp->domain;
1791 free_devinfo_mem(info);
1792 domain_exit(domain);
1795 list_add(&info->link, &domain->devices);
1796 list_add(&info->global, &device_domain_list);
1798 spin_unlock_irqrestore(&device_domain_lock, flags);
1802 info = alloc_devinfo_mem();
1805 info->bus = pdev->bus->number;
1806 info->devfn = pdev->devfn;
1808 info->domain = domain;
1809 spin_lock_irqsave(&device_domain_lock, flags);
1810 /* somebody is fast */
1811 found = find_domain(pdev);
1812 if (found != NULL) {
1813 spin_unlock_irqrestore(&device_domain_lock, flags);
1814 if (found != domain) {
1815 domain_exit(domain);
1818 free_devinfo_mem(info);
1821 list_add(&info->link, &domain->devices);
1822 list_add(&info->global, &device_domain_list);
1823 pdev->dev.archdata.iommu = info;
1824 spin_unlock_irqrestore(&device_domain_lock, flags);
1827 /* recheck it here, maybe others set it */
1828 return find_domain(pdev);
/*
 * iommu_prepare_identity_map() - build a 1:1 (iova == physical) map of
 * [start, end) for this device; used for RMRR / ISA / graphics regions
 * that must remain reachable at their physical addresses.
 * NOTE(review): 'size' computation, returns and gotos are elided.
 */
1831 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1832 unsigned long long start,
1833 unsigned long long end)
1835 struct dmar_domain *domain;
1837 unsigned long long base;
1841 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1842 pci_name(pdev), start, end);
1843 /* page table init */
1844 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1848 /* The address might not be aligned */
1849 base = start & PAGE_MASK;
1851 size = PAGE_ALIGN(size);
/* take the range out of the domain's allocatable iova space */
1852 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1853 IOVA_PFN(base + size) - 1)) {
1854 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1859 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1860 size, base, pci_name(pdev));
1862 * RMRR range might have overlap with physical memory range,
1865 dma_pte_clear_range(domain, base, base + size);
1867 ret = domain_page_mapping(domain, base, base, size,
1868 DMA_PTE_READ|DMA_PTE_WRITE);
1872 /* context entry init */
1873 ret = domain_context_mapping(domain, pdev);
1877 domain_exit(domain);
/* Identity-map a device's BIOS-reported RMRR region; devices marked
 * with the dummy info bypass translation and are skipped. */
1882 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1883 struct pci_dev *pdev)
1885 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1887 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1888 rmrr->end_address + 1);
1891 #ifdef CONFIG_DMAR_GFX_WA
/* Context carried through work_with_active_regions() callbacks:
 * target device plus (in members elided here) a result code. */
1892 struct iommu_prepare_data {
1893 struct pci_dev *pdev;
/* Per-memory-region callback: identity-map [start_pfn, end_pfn)
 * pages for data->pdev; the result lands in data->ret. */
1897 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1898 unsigned long end_pfn, void *datax)
1900 struct iommu_prepare_data *data;
1902 data = (struct iommu_prepare_data *)datax;
1904 data->ret = iommu_prepare_identity_map(data->pdev,
1905 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
/* Identity-map the active memory regions of every online node for
 * pdev (the graphics 1:1-mapping work-around). */
1910 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1913 struct iommu_prepare_data data;
1918 for_each_online_node(nid) {
1919 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
/* CONFIG_DMAR_GFX_WA: give every graphics device that is not already
 * bypassing translation a full 1:1 mapping of usable RAM. */
1926 static void __init iommu_prepare_gfx_mapping(void)
1928 struct pci_dev *pdev = NULL;
1931 for_each_pci_dev(pdev) {
1932 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1933 !IS_GFX_DEVICE(pdev))
1935 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1937 ret = iommu_prepare_with_active_regions(pdev);
1939 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1942 #else /* !CONFIG_DMAR_GFX_WA */
1943 static inline void iommu_prepare_gfx_mapping(void)
1949 #ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * iommu_prepare_isa() - identity-map the first 16MB for the ISA/LPC
 * bridge so legacy floppy DMA keeps working.
 * NOTE(review): the failure message says "0-64M" but the mapping is
 * 16*1024*1024 bytes -- message and mapping disagree; confirm intent.
 */
1950 static inline void iommu_prepare_isa(void)
1952 struct pci_dev *pdev;
1955 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1959 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1960 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1963 printk("IOMMU: Failed to create 0-64M identity map, "
1964 "floppy might not work\n");
/* CONFIG_DMAR_FLOPPY_WA disabled: no ISA identity mapping needed. */
1968 static inline void iommu_prepare_isa(void)
1972 #endif /* !CONFIG_DMAR_FLOPPY_WA */
/*
 * init_dmars() - one-time DMA-remapping bring-up: count the IOMMUs,
 * allocate the global iommu array and deferred-flush tables, create
 * per-IOMMU domain bookkeeping and root entries, pick register-based
 * or queued invalidation, install RMRR/gfx/ISA identity maps, then
 * program root entries, flush caches and enable translation.
 * NOTE(review): extract is elided (error paths, gotos and braces).
 */
1974 static int __init init_dmars(void)
1976 struct dmar_drhd_unit *drhd;
1977 struct dmar_rmrr_unit *rmrr;
1978 struct pci_dev *pdev;
1979 struct intel_iommu *iommu;
1980 int i, ret, unit = 0;
1985 * initialize and program root entry to not present
1988 for_each_drhd_unit(drhd) {
1991 * lock not needed as this is only incremented in the single
1992 * threaded kernel __init code path all other access are read
1997 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2000 printk(KERN_ERR "Allocating global iommu array failed\n");
2005 deferred_flush = kzalloc(g_num_of_iommus *
2006 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2007 if (!deferred_flush) {
2013 for_each_drhd_unit(drhd) {
2017 iommu = drhd->iommu;
2018 g_iommus[iommu->seq_id] = iommu;
2020 ret = iommu_init_domains(iommu);
2026 * we could share the same root & context tables
2027 * among all IOMMUs. Need to split it later.
2029 ret = iommu_alloc_root_entry(iommu);
2031 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
/* choose the invalidation interface: queued if available,
 * register-based otherwise */
2036 for_each_drhd_unit(drhd) {
2040 iommu = drhd->iommu;
2041 if (dmar_enable_qi(iommu)) {
2043 * Queued Invalidate not enabled, use Register Based
2046 iommu->flush.flush_context = __iommu_flush_context;
2047 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2048 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2050 (unsigned long long)drhd->reg_base_addr);
2052 iommu->flush.flush_context = qi_flush_context;
2053 iommu->flush.flush_iotlb = qi_flush_iotlb;
2054 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2056 (unsigned long long)drhd->reg_base_addr);
2062 * for each dev attached to rmrr
2064 * locate drhd for dev, alloc domain for dev
2065 * allocate free domain
2066 * allocate page table entries for rmrr
2067 * if context not allocated for bus
2068 * allocate and init context
2069 * set present in root table for this bus
2070 * init context with domain, translation etc
2074 for_each_rmrr_units(rmrr) {
2075 for (i = 0; i < rmrr->devices_cnt; i++) {
2076 pdev = rmrr->devices[i];
2077 /* some BIOS lists non-exist devices in DMAR table */
2080 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2083 "IOMMU: mapping reserved region failed\n");
2087 iommu_prepare_gfx_mapping();
2089 iommu_prepare_isa();
2094 * global invalidate context cache
2095 * global invalidate iotlb
2096 * enable translation
2098 for_each_drhd_unit(drhd) {
2101 iommu = drhd->iommu;
2102 sprintf (iommu->name, "dmar%d", unit++);
2104 iommu_flush_write_buffer(iommu);
2106 ret = dmar_set_interrupt(iommu);
2110 iommu_set_root_entry(iommu);
2112 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2114 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2116 iommu_disable_protect_mem_regions(iommu);
2118 ret = iommu_enable_translation(iommu);
/* error unwind: walk the units again and tear down (elided) */
2125 for_each_drhd_unit(drhd) {
2128 iommu = drhd->iommu;
/* Round size up so [host_addr, host_addr+size) covers whole pages,
 * accounting for host_addr's offset within its first page. */
2135 static inline u64 aligned_size(u64 host_addr, size_t size)
2138 addr = (host_addr & (~PAGE_MASK)) + size;
2139 return PAGE_ALIGN(addr);
/* Allocate 'size' bytes of IO virtual address space below 'end',
 * clamped to the domain's maximum address.  Return type, the 'piova'
 * declaration and the final return are elided in this extract. */
2143 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2147 /* Make sure it's in range */
2148 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2149 if (!size || (IOVA_START_ADDR + size > end))
2152 piova = alloc_iova(&domain->iovad,
2153 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
/*
 * __intel_alloc_iova() - allocate an iova honouring the device's DMA
 * mask: try the 32-bit space first (cheaper for hardware with SAC),
 * fall back to the full mask; dmar_forcedac skips the 32-bit attempt.
 */
2157 static struct iova *
2158 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2159 size_t size, u64 dma_mask)
2161 struct pci_dev *pdev = to_pci_dev(dev);
2162 struct iova *iova = NULL;
2164 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2165 iova = iommu_alloc_iova(domain, size, dma_mask);
2168 * First try to allocate an io virtual address in
2169 * DMA_32BIT_MASK and if that fails then try allocating
2172 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2174 iova = iommu_alloc_iova(domain, size, dma_mask);
2178 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
/* Get (or lazily create) the device's domain and ensure its context
 * entry is installed; used on every map operation's slow path. */
2185 static struct dmar_domain *
2186 get_valid_domain_for_dev(struct pci_dev *pdev)
2188 struct dmar_domain *domain;
2191 domain = get_domain_for_dev(pdev,
2192 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2195 "Allocating domain for %s failed", pci_name(pdev));
2199 /* make sure context mapping is ok */
2200 if (unlikely(!domain_context_mapped(pdev))) {
2201 ret = domain_context_mapping(domain, pdev);
2204 "Domain context map for %s failed",
/*
 * __intel_map_single() - core map path: allocate an iova within the
 * mask, build page-table entries for the (page-rounded) physical
 * range, flush the IOTLB for the new mapping, and return the DMA
 * address including the sub-page offset of paddr.
 * NOTE(review): extract is elided (prot/ret/iova declarations, error
 * branches and some returns are missing).
 */
2213 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2214 size_t size, int dir, u64 dma_mask)
2216 struct pci_dev *pdev = to_pci_dev(hwdev);
2217 struct dmar_domain *domain;
2218 phys_addr_t start_paddr;
2222 struct intel_iommu *iommu;
2224 BUG_ON(dir == DMA_NONE);
/* devices flagged with the dummy info bypass translation entirely */
2225 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2228 domain = get_valid_domain_for_dev(pdev);
2232 iommu = domain_get_iommu(domain);
2233 size = aligned_size((u64)paddr, size);
2235 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2239 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2242 * Check if DMAR supports zero-length reads on write only
2245 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2246 !cap_zlr(iommu->cap))
2247 prot |= DMA_PTE_READ;
2248 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2249 prot |= DMA_PTE_WRITE;
2251 * paddr - (paddr + size) might be partial page, we should map the whole
2252 * page. Note: if two part of one page are separately mapped, we
2253 * might have two guest_addr mapping to the same host paddr, but this
2254 * is not a big problem
2256 ret = domain_page_mapping(domain, start_paddr,
2257 ((u64)paddr) & PAGE_MASK, size, prot);
2261 /* it's a non-present to present mapping */
2262 ret = iommu_flush_iotlb_psi(iommu, domain->id,
2263 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2265 iommu_flush_write_buffer(iommu);
2267 return start_paddr + ((u64)paddr & (~PAGE_MASK));
/* error path: release the iova we allocated */
2271 __free_iova(&domain->iovad, iova);
2272 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
2273 pci_name(pdev), size, (unsigned long long)paddr, dir);
/* dma_map_ops .map_page: map one page-offset span via the common
 * single-mapping path, using the device's streaming DMA mask. */
2277 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2278 unsigned long offset, size_t size,
2279 enum dma_data_direction dir,
2280 struct dma_attrs *attrs)
2282 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2283 dir, to_pci_dev(dev)->dma_mask);
/* Exported single-buffer map entry point; thin wrapper over
 * __intel_map_single() with the device's streaming DMA mask. */
2286 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2287 size_t size, int dir)
2289 return __intel_map_single(hwdev, paddr, size, dir,
2290 to_pci_dev(hwdev)->dma_mask);
/*
 * flush_unmaps() - drain the deferred-unmap queues: for each IOMMU
 * with pending entries do one global IOTLB flush, then free all the
 * queued iovas.  Called with async_umap_flush_lock held (by the timer
 * callback and the high-water-mark path).
 */
2293 static void flush_unmaps(void)
2299 /* just flush them all */
2300 for (i = 0; i < g_num_of_iommus; i++) {
2301 struct intel_iommu *iommu = g_iommus[i];
2305 if (deferred_flush[i].next) {
2306 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2307 DMA_TLB_GLOBAL_FLUSH, 0);
2308 for (j = 0; j < deferred_flush[i].next; j++) {
2309 __free_iova(&deferred_flush[i].domain[j]->iovad,
2310 deferred_flush[i].iova[j]);
2312 deferred_flush[i].next = 0;
/* Timer callback: periodically drain the deferred-unmap queues
 * under the async flush lock. */
2319 static void flush_unmaps_timeout(unsigned long data)
2321 unsigned long flags;
2323 spin_lock_irqsave(&async_umap_flush_lock, flags);
2325 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * add_unmap() - queue an iova for deferred freeing on its IOMMU's
 * deferred_flush table, flushing synchronously when the queue hits
 * HIGH_WATER_MARK, and (re)arming the 10ms drain timer.
 * NOTE(review): extract is elided (iommu_id/next declarations and the
 * timer-arming condition are missing).
 */
2328 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2330 unsigned long flags;
2332 struct intel_iommu *iommu;
2334 spin_lock_irqsave(&async_umap_flush_lock, flags);
2335 if (list_size == HIGH_WATER_MARK)
2338 iommu = domain_get_iommu(dom);
2339 iommu_id = iommu->seq_id;
2341 next = deferred_flush[iommu_id].next;
2342 deferred_flush[iommu_id].domain[next] = dom;
2343 deferred_flush[iommu_id].iova[next] = iova;
2344 deferred_flush[iommu_id].next++;
2347 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2351 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * intel_unmap_page() - dma_map_ops .unmap_page: clear and free the
 * page-table range behind dev_addr, then either flush the IOTLB and
 * free the iova immediately (intel_iommu_strict) or queue the iova on
 * the deferred-unmap path to batch flushes.
 */
2354 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2355 size_t size, enum dma_data_direction dir,
2356 struct dma_attrs *attrs)
2358 struct pci_dev *pdev = to_pci_dev(dev);
2359 struct dmar_domain *domain;
2360 unsigned long start_addr;
2362 struct intel_iommu *iommu;
/* bypass devices were never mapped: nothing to undo */
2364 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2366 domain = find_domain(pdev);
2369 iommu = domain_get_iommu(domain);
2371 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2375 start_addr = iova->pfn_lo << PAGE_SHIFT;
2376 size = aligned_size((u64)dev_addr, size);
2378 pr_debug("Device %s unmapping: %lx@%llx\n",
2379 pci_name(pdev), size, (unsigned long long)start_addr);
2381 /* clear the whole page */
2382 dma_pte_clear_range(domain, start_addr, start_addr + size);
2383 /* free page tables */
2384 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2385 if (intel_iommu_strict) {
2386 if (iommu_flush_iotlb_psi(iommu,
2387 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2388 iommu_flush_write_buffer(iommu);
2390 __free_iova(&domain->iovad, iova);
2392 add_unmap(domain, iova);
2394 * queue up the release of the unmap to save the 1/6th of the
2395 * cpu used up by the iotlb flush operation...
/* Exported single-buffer unmap: delegates to intel_unmap_page(). */
2400 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2403 intel_unmap_page(dev, dev_addr, size, dir, NULL);
/*
 * intel_alloc_coherent() - allocate zeroed pages and map them through
 * the IOMMU under the device's coherent DMA mask; frees the pages if
 * the mapping fails.  GFP_DMA/GFP_DMA32 are stripped because the
 * IOMMU, not the allocator zone, provides addressability.
 */
2406 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2407 dma_addr_t *dma_handle, gfp_t flags)
2412 size = PAGE_ALIGN(size);
2413 order = get_order(size);
2414 flags &= ~(GFP_DMA | GFP_DMA32);
2416 vaddr = (void *)__get_free_pages(flags, order);
2419 memset(vaddr, 0, size);
2421 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2423 hwdev->coherent_dma_mask);
2426 free_pages((unsigned long)vaddr, order);
/* Undo intel_alloc_coherent(): unmap the DMA range and free the
 * underlying pages. */
2430 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2431 dma_addr_t dma_handle)
2435 size = PAGE_ALIGN(size);
2436 order = get_order(size);
2438 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2439 free_pages((unsigned long)vaddr, order);
2442 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
/*
 * intel_unmap_sg() - unmap a scatterlist previously mapped by
 * intel_map_sg(): recompute the total aligned span, clear and free
 * the page-table range, flush the IOTLB and release the iova.
 */
2444 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2445 int nelems, enum dma_data_direction dir,
2446 struct dma_attrs *attrs)
2449 struct pci_dev *pdev = to_pci_dev(hwdev);
2450 struct dmar_domain *domain;
2451 unsigned long start_addr;
2455 struct scatterlist *sg;
2456 struct intel_iommu *iommu;
2458 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2461 domain = find_domain(pdev);
2464 iommu = domain_get_iommu(domain);
/* the iova covering the whole list is keyed by the first entry */
2466 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2469 for_each_sg(sglist, sg, nelems, i) {
2470 addr = SG_ENT_VIRT_ADDRESS(sg);
2471 size += aligned_size((u64)addr, sg->length);
2474 start_addr = iova->pfn_lo << PAGE_SHIFT;
2476 /* clear the whole page */
2477 dma_pte_clear_range(domain, start_addr, start_addr + size);
2478 /* free page tables */
2479 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2481 if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2482 size >> VTD_PAGE_SHIFT, 0))
2483 iommu_flush_write_buffer(iommu);
2486 __free_iova(&domain->iovad, iova);
/* Map a scatterlist without translation (bypass devices): each
 * entry's DMA address is simply its bus/physical address. */
2489 static int intel_nontranslate_map_sg(struct device *hddev,
2490 struct scatterlist *sglist, int nelems, int dir)
2493 struct scatterlist *sg;
2495 for_each_sg(sglist, sg, nelems, i) {
2496 BUG_ON(!sg_page(sg));
2497 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2498 sg->dma_length = sg->length;
/*
 * intel_map_sg() - dma_map_ops .map_sg: allocate one contiguous iova
 * span covering the whole scatterlist, map each entry into it at a
 * running offset, set per-entry dma_address/dma_length, then flush
 * the IOTLB for the new mappings.  On a mapping failure the partial
 * range is torn down and the iova freed.
 * NOTE(review): extract is elided (ret/offset/prot declarations,
 * offset advancement and some returns are missing).
 */
2503 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2504 enum dma_data_direction dir, struct dma_attrs *attrs)
2508 struct pci_dev *pdev = to_pci_dev(hwdev);
2509 struct dmar_domain *domain;
2513 struct iova *iova = NULL;
2515 struct scatterlist *sg;
2516 unsigned long start_addr;
2517 struct intel_iommu *iommu;
2519 BUG_ON(dir == DMA_NONE);
2520 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2521 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2523 domain = get_valid_domain_for_dev(pdev);
2527 iommu = domain_get_iommu(domain);
/* first pass: total page-aligned size of all entries */
2529 for_each_sg(sglist, sg, nelems, i) {
2530 addr = SG_ENT_VIRT_ADDRESS(sg);
2531 addr = (void *)virt_to_phys(addr);
2532 size += aligned_size((u64)addr, sg->length);
2535 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2537 sglist->dma_length = 0;
2542 * Check if DMAR supports zero-length reads on write only
2545 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2546 !cap_zlr(iommu->cap))
2547 prot |= DMA_PTE_READ;
2548 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2549 prot |= DMA_PTE_WRITE;
2551 start_addr = iova->pfn_lo << PAGE_SHIFT;
/* second pass: map each entry at its offset within the span */
2553 for_each_sg(sglist, sg, nelems, i) {
2554 addr = SG_ENT_VIRT_ADDRESS(sg);
2555 addr = (void *)virt_to_phys(addr);
2556 size = aligned_size((u64)addr, sg->length);
2557 ret = domain_page_mapping(domain, start_addr + offset,
2558 ((u64)addr) & PAGE_MASK,
2561 /* clear the page */
2562 dma_pte_clear_range(domain, start_addr,
2563 start_addr + offset);
2564 /* free page tables */
2565 dma_pte_free_pagetable(domain, start_addr,
2566 start_addr + offset);
2568 __free_iova(&domain->iovad, iova);
2571 sg->dma_address = start_addr + offset +
2572 ((u64)addr & (~PAGE_MASK));
2573 sg->dma_length = sg->length;
2577 /* it's a non-present to present mapping */
2578 if (iommu_flush_iotlb_psi(iommu, domain->id,
2579 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2580 iommu_flush_write_buffer(iommu);
/* dma_map_ops .mapping_error hook; body is elided in this extract --
 * presumably compares dma_addr against the error sentinel (confirm). */
2584 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
/* The dma_map_ops table installed when DMA remapping is enabled;
 * wires the intel_* implementations into the generic DMA API. */
2589 struct dma_map_ops intel_dma_ops = {
2590 .alloc_coherent = intel_alloc_coherent,
2591 .free_coherent = intel_free_coherent,
2592 .map_sg = intel_map_sg,
2593 .unmap_sg = intel_unmap_sg,
2594 .map_page = intel_map_page,
2595 .unmap_page = intel_unmap_page,
2596 .mapping_error = intel_mapping_error,
/* Create the slab cache for struct dmar_domain objects. */
2599 static inline int iommu_domain_cache_init(void)
2603 iommu_domain_cache = kmem_cache_create("iommu_domain",
2604 sizeof(struct dmar_domain),
2609 if (!iommu_domain_cache) {
2610 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
/* Create the slab cache for struct device_domain_info objects. */
2617 static inline int iommu_devinfo_cache_init(void)
2621 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2622 sizeof(struct device_domain_info),
2626 if (!iommu_devinfo_cache) {
2627 printk(KERN_ERR "Couldn't create devinfo cache\n");
/* Create the slab cache for struct iova nodes. */
2634 static inline int iommu_iova_cache_init(void)
2638 iommu_iova_cache = kmem_cache_create("iommu_iova",
2639 sizeof(struct iova),
2643 if (!iommu_iova_cache) {
2644 printk(KERN_ERR "Couldn't create iova cache\n");
/* Initialise all three slab caches (iova, domain, devinfo); on a
 * failure the already-created caches are destroyed in reverse order
 * (error labels are elided in this extract). */
2651 static int __init iommu_init_mempool(void)
2654 ret = iommu_iova_cache_init();
2658 ret = iommu_domain_cache_init();
2662 ret = iommu_devinfo_cache_init();
2666 kmem_cache_destroy(iommu_domain_cache);
2668 kmem_cache_destroy(iommu_iova_cache);
/* Destroy the three slab caches created by iommu_init_mempool(). */
2673 static void __init iommu_exit_mempool(void)
2675 kmem_cache_destroy(iommu_devinfo_cache);
2676 kmem_cache_destroy(iommu_domain_cache);
2677 kmem_cache_destroy(iommu_iova_cache);
/*
 * init_no_remapping_devices() - mark DRHD units to skip: units whose
 * device list is entirely absent are ignored, and units that cover
 * only graphics devices have those devices flagged with
 * DUMMY_DEVICE_DOMAIN_INFO so they bypass translation.
 */
2681 static void __init init_no_remapping_devices(void)
2683 struct dmar_drhd_unit *drhd;
2685 for_each_drhd_unit(drhd) {
2686 if (!drhd->include_all) {
2688 for (i = 0; i < drhd->devices_cnt; i++)
2689 if (drhd->devices[i] != NULL)
2691 /* ignore DMAR unit if no pci devices exist */
2692 if (i == drhd->devices_cnt)
2700 for_each_drhd_unit(drhd) {
2702 if (drhd->ignored || drhd->include_all)
2705 for (i = 0; i < drhd->devices_cnt; i++)
2706 if (drhd->devices[i] &&
2707 !IS_GFX_DEVICE(drhd->devices[i]))
/* a non-gfx device exists: this unit must stay active */
2710 if (i < drhd->devices_cnt)
2713 /* bypass IOMMU if it is just for gfx devices */
2715 for (i = 0; i < drhd->devices_cnt; i++) {
2716 if (!drhd->devices[i])
2718 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
/*
 * intel_iommu_init() - driver entry point: parse the DMAR table and
 * device scopes, bail out when remapping is unnecessary (no_iommu,
 * swiotlb, or disabled), then set up mempools, reserved ranges and
 * the hardware via init_dmars(), and install the DMA ops and the
 * generic IOMMU API ops.
 */
2723 int __init intel_iommu_init(void)
2727 if (dmar_table_init())
2730 if (dmar_dev_scope_init())
2734 * Check the need for DMA-remapping initialization now.
2735 * Above initialization will also be used by Interrupt-remapping.
2737 if (no_iommu || swiotlb || dmar_disabled)
2740 iommu_init_mempool();
2741 dmar_init_reserved_ranges();
2743 init_no_remapping_devices();
/* init_dmars() failure unwinds the reserved ranges and mempools */
2747 printk(KERN_ERR "IOMMU: dmar init failed\n");
2748 put_iova_domain(&reserved_iova_list);
2749 iommu_exit_mempool();
2753 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2755 init_timer(&unmap_timer);
2757 dma_ops = &intel_dma_ops;
2759 register_iommu(&intel_iommu_ops);
/*
 * vm_domain_add_dev_info() - attach a device to a virtual-machine
 * domain: allocate a device_domain_info, link it on the domain and
 * global lists, and point the device's archdata at it.
 */
2764 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2765 struct pci_dev *pdev)
2767 struct device_domain_info *info;
2768 unsigned long flags;
2770 info = alloc_devinfo_mem();
2774 info->bus = pdev->bus->number;
2775 info->devfn = pdev->devfn;
2777 info->domain = domain;
2779 spin_lock_irqsave(&device_domain_lock, flags);
2780 list_add(&info->link, &domain->devices);
2781 list_add(&info->global, &device_domain_list);
2782 pdev->dev.archdata.iommu = info;
2783 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * vm_domain_remove_one_dev_info() - detach a single device from a VM
 * domain.  While walking the domain's device list it also notes
 * whether any other device on the same IOMMU remains attached; if
 * none does, the IOMMU is cleared from the domain's bitmap and the
 * coherency/count bookkeeping is updated.
 * NOTE(review): extract is elided ('found' tracking and loop exits).
 */
2788 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2789 struct pci_dev *pdev)
2791 struct device_domain_info *info;
2792 struct intel_iommu *iommu;
2793 unsigned long flags;
2795 struct list_head *entry, *tmp;
2797 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2801 spin_lock_irqsave(&device_domain_lock, flags);
2802 list_for_each_safe(entry, tmp, &domain->devices) {
2803 info = list_entry(entry, struct device_domain_info, link);
2804 if (info->bus == pdev->bus->number &&
2805 info->devfn == pdev->devfn) {
2806 list_del(&info->link);
2807 list_del(&info->global);
2809 info->dev->dev.archdata.iommu = NULL;
/* drop the lock for the hardware detach */
2810 spin_unlock_irqrestore(&device_domain_lock, flags);
2812 iommu_detach_dev(iommu, info->bus, info->devfn);
2813 free_devinfo_mem(info);
2815 spin_lock_irqsave(&device_domain_lock, flags);
2823 /* if there is no other devices under the same iommu
2824 * owned by this domain, clear this iommu in iommu_bmp
2825 * update iommu count and coherency
2827 if (device_to_iommu(info->bus, info->devfn) == iommu)
2832 unsigned long tmp_flags;
2833 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2834 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2835 domain->iommu_count--;
2836 domain_update_iommu_coherency(domain);
2837 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2840 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * vm_domain_remove_all_dev_info() - detach every device from a VM
 * domain, updating the per-IOMMU bitmap, count and coherency as each
 * IOMMU loses its last device.  The list lock is dropped around the
 * hardware detach, so the loop restarts from the head each time.
 */
2843 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2845 struct device_domain_info *info;
2846 struct intel_iommu *iommu;
2847 unsigned long flags1, flags2;
2849 spin_lock_irqsave(&device_domain_lock, flags1);
2850 while (!list_empty(&domain->devices)) {
2851 info = list_entry(domain->devices.next,
2852 struct device_domain_info, link);
2853 list_del(&info->link);
2854 list_del(&info->global);
2856 info->dev->dev.archdata.iommu = NULL;
2858 spin_unlock_irqrestore(&device_domain_lock, flags1);
2860 iommu = device_to_iommu(info->bus, info->devfn);
2861 iommu_detach_dev(iommu, info->bus, info->devfn);
2863 /* clear this iommu in iommu_bmp, update iommu count
2866 spin_lock_irqsave(&domain->iommu_lock, flags2);
2867 if (test_and_clear_bit(iommu->seq_id,
2868 &domain->iommu_bmp)) {
2869 domain->iommu_count--;
2870 domain_update_iommu_coherency(domain);
2872 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2874 free_devinfo_mem(info);
2875 spin_lock_irqsave(&device_domain_lock, flags1);
2877 spin_unlock_irqrestore(&device_domain_lock, flags1);
/*
 * Allocator cursor for virtual-machine domain ids, handed out by
 * iommu_alloc_vm_domain().  These ids are software-only: they are never
 * programmed into hardware context entries.
 */
2880 /* domain id for virtual machine, it won't be set in context */
2881 static unsigned long vm_domid;
/*
 * vm_domain_min_agaw() - smallest adjusted guest address width (AGAW)
 * among all IOMMUs currently serving @domain.
 *
 * Walks the set bits of domain->iommu_bmp and takes the minimum of the
 * corresponding g_iommus[]->agaw values, starting from the domain's own
 * agaw.  The minimum bounds the address range every attached unit can
 * translate.
 * NOTE(review): the declaration of the bit index 'i' and the final
 * 'return min_agaw;' are not visible in this chunk -- confirm against
 * the full file.
 */
2883 static int vm_domain_min_agaw(struct dmar_domain *domain)
2886 int min_agaw = domain->agaw;
2888 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2889 for (; i < g_num_of_iommus; ) {
2890 if (min_agaw > g_iommus[i]->agaw)
2891 min_agaw = g_iommus[i]->agaw;
2893 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
/*
 * iommu_alloc_vm_domain() - allocate a dmar_domain for a virtual machine.
 *
 * Assigns the next software-only id from vm_domid, clears the per-IOMMU
 * bitmap and tags the domain DOMAIN_FLAG_VIRTUAL_MACHINE so the VM-domain
 * code paths are taken for it later.
 * NOTE(review): the NULL check on alloc_domain_mem() and the return
 * statement are not visible in this chunk -- confirm the failure path in
 * the full file.
 */
2899 static struct dmar_domain *iommu_alloc_vm_domain(void)
2901 struct dmar_domain *domain;
2903 domain = alloc_domain_mem();
2907 domain->id = vm_domid++;
2908 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2909 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
/*
 * vm_domain_init() - second-stage initialisation of a VM domain.
 * @guest_width: guest address width in bits; stored in domain->gaw and
 *	rounded to a supported AGAW via guestwidth_to_adjustwidth() /
 *	width_to_agaw().
 *
 * Sets up the IOVA allocator (32-bit PFN space), the mapping and iommu
 * locks, the reserved address ranges, an empty device list and zeroed
 * iommu accounting, then allocates the top-level page directory and
 * flushes it to memory.
 * NOTE(review): the declaration of 'adjust_width', the NULL check on the
 * freshly allocated pgd and the return statements are not visible in this
 * chunk -- confirm against the full file.
 */
2914 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2918 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2919 spin_lock_init(&domain->mapping_lock);
2920 spin_lock_init(&domain->iommu_lock);
2922 domain_reserve_special_ranges(domain);
2924 /* calculate AGAW */
2925 domain->gaw = guest_width;
2926 adjust_width = guestwidth_to_adjustwidth(guest_width);
2927 domain->agaw = width_to_agaw(adjust_width);
2929 INIT_LIST_HEAD(&domain->devices);
/* no IOMMUs attached yet; counters updated as devices are added */
2931 domain->iommu_count = 0;
2932 domain->iommu_coherency = 0;
2933 domain->max_addr = 0;
2935 /* always allocate the top pgd */
2936 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2939 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
/*
 * iommu_free_vm_domain() - release @domain's slots on every hardware unit.
 *
 * For each DRHD unit, scans that unit's domain_ids allocation bitmap and,
 * wherever iommu->domains[i] points at @domain, clears both the id bit and
 * the pointer under iommu->lock.
 * NOTE(review): the declaration of the bit index 'i' is not visible here;
 * also note the bitmap scan itself runs outside iommu->lock, with only the
 * clear done under it -- confirm that is safe in context.
 */
2943 static void iommu_free_vm_domain(struct dmar_domain *domain)
2945 unsigned long flags;
2946 struct dmar_drhd_unit *drhd;
2947 struct intel_iommu *iommu;
2949 unsigned long ndomains;
2951 for_each_drhd_unit(drhd) {
2954 iommu = drhd->iommu;
2956 ndomains = cap_ndoms(iommu->cap);
2957 i = find_first_bit(iommu->domain_ids, ndomains);
2958 for (; i < ndomains; ) {
2959 if (iommu->domains[i] == domain) {
2960 spin_lock_irqsave(&iommu->lock, flags);
2961 clear_bit(i, iommu->domain_ids);
2962 iommu->domains[i] = NULL;
2963 spin_unlock_irqrestore(&iommu->lock, flags);
2966 i = find_next_bit(iommu->domain_ids, ndomains, i+1);
/*
 * vm_domain_exit() - tear a VM domain down completely.
 *
 * Order: detach all devices, release the IOVA allocator, clear and free
 * the DMA page tables, release the domain's per-IOMMU id slots, then free
 * the domain structure itself.
 * NOTE(review): the 'u64 end;' declaration and the '!domain' early-return
 * guard implied by the comment below are not visible in this chunk.
 */
2971 static void vm_domain_exit(struct dmar_domain *domain)
2975 /* Domain 0 is reserved, so don't process it */
2979 vm_domain_remove_all_dev_info(domain);
/* destroy iovas */
2981 put_iova_domain(&domain->iovad);
2982 end = DOMAIN_MAX_ADDR(domain->gaw);
/* NOTE(review): masking with ~VTD_PAGE_MASK keeps only the sub-page bits
 * of the max address (the attach/map paths use 'end & VTD_PAGE_MASK');
 * verify this against dma_pte_clear_range()'s expectations. */
2983 end = end & (~VTD_PAGE_MASK);
2986 dma_pte_clear_range(domain, 0, end);
2988 /* free page tables */
2989 dma_pte_free_pagetable(domain, 0, end);
2991 iommu_free_vm_domain(domain);
2992 free_domain_mem(domain);
/*
 * intel_iommu_domain_init() - iommu_ops .domain_init callback.
 *
 * Allocates a virtual-machine dmar_domain, initialises it with the
 * 48-bit DEFAULT_DOMAIN_ADDRESS_WIDTH, and stores it as the generic
 * domain's private data.  On init failure the half-built domain is torn
 * down again via vm_domain_exit().
 * NOTE(review): the NULL check on iommu_alloc_vm_domain(), the printk
 * prefixes for the two error strings below, and the return statements
 * are not visible in this chunk.
 */
2995 static int intel_iommu_domain_init(struct iommu_domain *domain)
2997 struct dmar_domain *dmar_domain;
2999 dmar_domain = iommu_alloc_vm_domain();
3002 "intel_iommu_domain_init: dmar_domain == NULL\n");
3005 if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3007 "intel_iommu_domain_init() failed\n");
3008 vm_domain_exit(dmar_domain);
3011 domain->priv = dmar_domain;
3016 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3018 struct dmar_domain *dmar_domain = domain->priv;
3020 domain->priv = NULL;
3021 vm_domain_exit(dmar_domain);
/*
 * intel_iommu_attach_device() - iommu_ops .attach_dev callback.
 *
 * Binds a PCI device to the dmar_domain behind @domain:
 *  1. If the device already has a context mapping, detach it from its old
 *     domain first; the VM-domain removal path is chosen when the NEW
 *     domain carries DOMAIN_FLAG_VIRTUAL_MACHINE.
 *  2. Reject the attach if this device's IOMMU agaw cannot cover the
 *     domain's current max_addr.
 *  3. Install the context mapping and record the device in the domain's
 *     device list.
 * NOTE(review): the parameter-list tail ('struct device *dev'), the 'else'
 * of the old-domain branch, the declarations of addr_width/end/ret and the
 * early returns are not visible in this chunk -- confirm against the full
 * file.
 */
3024 static int intel_iommu_attach_device(struct iommu_domain *domain,
3027 struct dmar_domain *dmar_domain = domain->priv;
3028 struct pci_dev *pdev = to_pci_dev(dev);
3029 struct intel_iommu *iommu;
3034 /* normally pdev is not mapped */
3035 if (unlikely(domain_context_mapped(pdev))) {
3036 struct dmar_domain *old_domain;
3038 old_domain = find_domain(pdev);
3040 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3041 vm_domain_remove_one_dev_info(old_domain, pdev);
3043 domain_remove_dev_info(old_domain);
3047 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3051 /* check if this iommu agaw is sufficient for max mapped address */
3052 addr_width = agaw_to_width(iommu->agaw);
/* highest address this unit can translate, aligned down to a VT-d page */
3053 end = DOMAIN_MAX_ADDR(addr_width);
3054 end = end & VTD_PAGE_MASK;
3055 if (end < dmar_domain->max_addr) {
3056 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3057 "sufficient for the mapped address (%llx)\n",
3058 __func__, iommu->agaw, dmar_domain->max_addr);
3062 ret = domain_context_mapping(dmar_domain, pdev);
3066 ret = vm_domain_add_dev_info(dmar_domain, pdev);
3070 static void intel_iommu_detach_device(struct iommu_domain *domain,
3073 struct dmar_domain *dmar_domain = domain->priv;
3074 struct pci_dev *pdev = to_pci_dev(dev);
3076 vm_domain_remove_one_dev_info(dmar_domain, pdev);
/*
 * intel_iommu_map_range() - iommu_ops .map callback.
 *
 * Translates IOMMU_READ/IOMMU_WRITE into DMA_PTE_READ/DMA_PTE_WRITE and
 * maps [iova, iova + size) to @hpa via domain_page_mapping().  When the
 * mapping would extend the domain's current top address, first verifies
 * that the smallest AGAW among the attached IOMMUs can still address the
 * new (page-aligned) top, then advances dmar_domain->max_addr.
 * NOTE(review): declarations of prot/max_addr/min_agaw/addr_width/end/ret,
 * prot's presumed zero-initialisation, the error return of the agaw check
 * and the final return are not visible in this chunk -- confirm.
 */
3079 static int intel_iommu_map_range(struct iommu_domain *domain,
3080 unsigned long iova, phys_addr_t hpa,
3081 size_t size, int iommu_prot)
3083 struct dmar_domain *dmar_domain = domain->priv;
3089 if (iommu_prot & IOMMU_READ)
3090 prot |= DMA_PTE_READ;
3091 if (iommu_prot & IOMMU_WRITE)
3092 prot |= DMA_PTE_WRITE;
/* top address touched by this mapping, rounded to VT-d page granularity */
3094 max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3095 if (dmar_domain->max_addr < max_addr) {
3099 /* check if minimum agaw is sufficient for mapped address */
3100 min_agaw = vm_domain_min_agaw(dmar_domain);
3101 addr_width = agaw_to_width(min_agaw);
3102 end = DOMAIN_MAX_ADDR(addr_width);
3103 end = end & VTD_PAGE_MASK;
3104 if (end < max_addr) {
3105 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3106 "sufficient for the mapped address (%llx)\n",
3107 __func__, min_agaw, max_addr);
3110 dmar_domain->max_addr = max_addr;
3113 ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
/*
 * intel_iommu_unmap_range() - iommu_ops .unmap callback.
 *
 * Clears the PTEs covering [iova, iova + size) after aligning the base
 * down and the size up to VT-d page granularity.  If the cleared range
 * was the current top of the domain, max_addr shrinks back to the new
 * top so future agaw checks stay accurate.
 * NOTE(review): the declaration of 'base' is not visible in this chunk.
 */
3117 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3118 unsigned long iova, size_t size)
3120 struct dmar_domain *dmar_domain = domain->priv;
3123 /* The address might not be aligned */
3124 base = iova & VTD_PAGE_MASK;
3125 size = VTD_PAGE_ALIGN(size);
3126 dma_pte_clear_range(dmar_domain, base, base + size);
3128 if (dmar_domain->max_addr == base + size)
3129 dmar_domain->max_addr = base;
/*
 * intel_iommu_iova_to_phys() - iommu_ops .iova_to_phys callback.
 *
 * Walks the domain's DMA page table for the given IO virtual address and
 * returns the physical address stored in the PTE.
 * NOTE(review): the 'iova' parameter line, the initialisation of 'phys',
 * the NULL-pte guard around dma_pte_addr() and the return statement are
 * not visible in this chunk -- confirm against the full file.
 */
3132 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3135 struct dmar_domain *dmar_domain = domain->priv;
3136 struct dma_pte *pte;
3139 pte = addr_to_dma_pte(dmar_domain, iova);
3141 phys = dma_pte_addr(pte);
/*
 * VT-d implementation of the generic IOMMU API.  Each member is one of
 * the intel_iommu_* callbacks defined above; the table is handed to the
 * core IOMMU layer so KVM and other clients can manage VM domains.
 */
3146 static struct iommu_ops intel_iommu_ops = {
3147 .domain_init = intel_iommu_domain_init,
3148 .domain_destroy = intel_iommu_domain_destroy,
3149 .attach_dev = intel_iommu_attach_device,
3150 .detach_dev = intel_iommu_detach_device,
3151 .map = intel_iommu_map_range,
3152 .unmap = intel_iommu_unmap_range,
3153 .iova_to_phys = intel_iommu_iova_to_phys,