2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
/* Root and context tables each occupy exactly one VT-d page. */
43 #define ROOT_SIZE VTD_PAGE_SIZE
44 #define CONTEXT_SIZE VTD_PAGE_SIZE
/* Classify a pci_dev by PCI class code: display controller / ISA bridge. */
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
/* Physical MMIO window used by IO-APICs; must never be handed out as IOVA. */
49 #define IOAPIC_RANGE_START (0xfee00000)
50 #define IOAPIC_RANGE_END (0xfeefffff)
51 #define IOVA_START_ADDR (0x1000)
/* Default guest address width for newly created DMA domains (bits). */
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
/* Highest address representable with a gaw-bit guest address width. */
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
/* Convert a byte address to its IOVA page frame number. */
57 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
/* Set for chipsets that need a write-buffer flush despite !cap_rwbf. */
64 static int rwbf_quirk;
69 * 12-63: Context Ptr (12 - (haw-1))
/* Number of root entries that fit in the one-page root table. */
76 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/*
 * Accessors for the hardware root-entry / context-entry / DMA PTE bit
 * layouts.  Bit 0 of each entry is the "present" bit; the entry's
 * page-aligned pointer lives in bits 12-63 (VTD_PAGE_MASK).
 * NOTE(review): several bodies below are truncated in this extract;
 * code is kept byte-identical.
 */
77 static inline bool root_present(struct root_entry *root)
79 return (root->val & 1);
81 static inline void set_root_present(struct root_entry *root)
/* Install the (page-aligned) physical address of the context table. */
85 static inline void set_root_value(struct root_entry *root, unsigned long value)
87 root->val |= value & VTD_PAGE_MASK;
/* Return the virtual address of the context table, if present. */
90 static inline struct context_entry *
91 get_context_addr_from_root(struct root_entry *root)
93 return (struct context_entry *)
94 (root_present(root)?phys_to_virt(
95 root->val & VTD_PAGE_MASK) :
/* Context-entry low-word layout (per VT-d spec): */
102 * 1: fault processing disable
103 * 2-3: translation type
104 * 12-63: address space root
110 struct context_entry {
115 static inline bool context_present(struct context_entry *context)
117 return (context->lo & 1);
119 static inline void context_set_present(struct context_entry *context)
/* Clear bit 1 (fault processing disable) => fault reporting enabled. */
124 static inline void context_set_fault_enable(struct context_entry *context)
126 context->lo &= (((u64)-1) << 2) | 1;
129 #define CONTEXT_TT_MULTI_LEVEL 0
/* Translation type occupies bits 2-3 of the low word. */
131 static inline void context_set_translation_type(struct context_entry *context,
134 context->lo &= (((u64)-1) << 4) | 3;
135 context->lo |= (value & 3) << 2;
/* Page-table root (physical, page aligned) goes in bits 12-63. */
138 static inline void context_set_address_root(struct context_entry *context,
141 context->lo |= value & VTD_PAGE_MASK;
/* AGAW field: 3 bits in the high word. */
144 static inline void context_set_address_width(struct context_entry *context,
147 context->hi |= value & 7;
/* 16-bit domain id at bits 8-23 of the high word. */
150 static inline void context_set_domain_id(struct context_entry *context,
153 context->hi |= (value & ((1 << 16) - 1)) << 8;
156 static inline void context_clear_entry(struct context_entry *context)
/* DMA PTE layout: bits 0-1 = read/write permission, bits 12-63 = address. */
168 * 12-63: Host physical address
174 static inline void dma_clear_pte(struct dma_pte *pte)
179 static inline void dma_set_pte_readable(struct dma_pte *pte)
181 pte->val |= DMA_PTE_READ;
184 static inline void dma_set_pte_writable(struct dma_pte *pte)
186 pte->val |= DMA_PTE_WRITE;
/* Replace the two permission bits with @prot. */
189 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
191 pte->val = (pte->val & ~3) | (prot & 3);
194 static inline u64 dma_pte_addr(struct dma_pte *pte)
196 return (pte->val & VTD_PAGE_MASK);
199 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
201 pte->val |= (addr & VTD_PAGE_MASK);
/* A PTE is considered present when either R or W permission is set. */
204 static inline bool dma_pte_present(struct dma_pte *pte)
206 return (pte->val & 3) != 0;
209 /* devices under the same p2p bridge are owned in one domain */
210 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
212 /* domain represents a virtual machine, more than one devices
213 * across iommus may be owned in one domain, e.g. kvm guest.
215 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
/* struct dmar_domain: one IOVA address space (opening brace not in extract). */
218 int id; /* domain id */
219 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
221 struct list_head devices; /* all devices' list */
222 struct iova_domain iovad; /* iova's that belong to this domain */
224 struct dma_pte *pgd; /* virtual address of top-level page directory */
225 spinlock_t mapping_lock; /* page table lock */
226 int gaw; /* max guest address width */
228 /* adjusted guest address width, 0 is level 2 30-bit */
231 int flags; /* flags to find out type of domain */
233 int iommu_coherency;/* indicate coherency of iommu access */
234 int iommu_count; /* reference count of iommu */
235 spinlock_t iommu_lock; /* protect iommu set in domain */
236 u64 max_addr; /* maximum mapped address */
239 /* PCI domain-device relationship */
240 struct device_domain_info {
241 struct list_head link; /* link to domain siblings */
242 struct list_head global; /* link to global list */
243 u8 bus; /* PCI bus number */
244 u8 devfn; /* PCI devfn number */
245 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
246 struct dmar_domain *domain; /* pointer to domain */
/* Deferred ("lazy") IOTLB flush machinery: unmapped IOVAs are batched and
 * released from a timer to amortize flush cost. */
249 static void flush_unmaps_timeout(unsigned long data);
251 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
253 #define HIGH_WATER_MARK 250
254 struct deferred_flush_tables {
256 struct iova *iova[HIGH_WATER_MARK];
257 struct dmar_domain *domain[HIGH_WATER_MARK];
260 static struct deferred_flush_tables *deferred_flush;
262 /* bitmap for indexing intel_iommus */
263 static int g_num_of_iommus;
265 static DEFINE_SPINLOCK(async_umap_flush_lock);
266 static LIST_HEAD(unmaps_to_do);
269 static long list_size;
271 static void domain_remove_dev_info(struct dmar_domain *domain);
/* DMA remapping default on/off is a Kconfig choice. */
273 #ifdef CONFIG_DMAR_DEFAULT_ON
274 int dmar_disabled = 0;
276 int dmar_disabled = 1;
277 #endif /*CONFIG_DMAR_DEFAULT_ON*/
/* Boot-time tunables, set by intel_iommu_setup() below. */
279 static int __initdata dmar_map_gfx = 1;
280 static int dmar_forcedac;
281 static int intel_iommu_strict;
283 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
284 static DEFINE_SPINLOCK(device_domain_lock);
285 static LIST_HEAD(device_domain_list);
287 static struct iommu_ops intel_iommu_ops;
/*
 * Parse the "intel_iommu=" kernel boot parameter.  Recognized comma-
 * separated tokens: on, off, igfx_off (skip graphics devices),
 * forcedac (force 64-bit DMA addressing), strict (no batched flushes).
 */
289 static int __init intel_iommu_setup(char *str)
294 if (!strncmp(str, "on", 2)) {
296 printk(KERN_INFO "Intel-IOMMU: enabled\n");
297 } else if (!strncmp(str, "off", 3)) {
299 printk(KERN_INFO "Intel-IOMMU: disabled\n");
300 } else if (!strncmp(str, "igfx_off", 8)) {
303 "Intel-IOMMU: disable GFX device mapping\n");
304 } else if (!strncmp(str, "forcedac", 8)) {
306 "Intel-IOMMU: Forcing DAC for PCI devices\n");
308 } else if (!strncmp(str, "strict", 6)) {
310 "Intel-IOMMU: disable batched IOTLB flush\n");
311 intel_iommu_strict = 1;
/* Advance to the next comma-separated token. */
314 str += strcspn(str, ",");
320 __setup("intel_iommu=", intel_iommu_setup);
/* Slab caches for the driver's small, frequently allocated objects. */
322 static struct kmem_cache *iommu_domain_cache;
323 static struct kmem_cache *iommu_devinfo_cache;
324 static struct kmem_cache *iommu_iova_cache;
/*
 * Allocate from @cachep with GFP_ATOMIC while temporarily setting
 * PF_MEMALLOC, so allocations from atomic/IOMMU paths can dip into
 * memory reserves; the previous PF_MEMALLOC state is restored after.
 */
326 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
331 /* trying to avoid low memory issues */
332 flags = current->flags & PF_MEMALLOC;
333 current->flags |= PF_MEMALLOC;
334 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
335 current->flags &= (~PF_MEMALLOC | flags);
/* Allocate one zeroed page for use as a (root/context/PTE) table. */
340 static inline void *alloc_pgtable_page(void)
345 /* trying to avoid low memory issues */
346 flags = current->flags & PF_MEMALLOC;
347 current->flags |= PF_MEMALLOC;
348 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
349 current->flags &= (~PF_MEMALLOC | flags);
353 static inline void *free_pgtable_page(void *vaddr)
355 free_page((unsigned long)vaddr);
/* Thin typed wrappers over the slab caches above. */
358 static inline void *alloc_domain_mem(void)
360 return iommu_kmem_cache_alloc(iommu_domain_cache);
363 static void free_domain_mem(void *vaddr)
365 kmem_cache_free(iommu_domain_cache, vaddr);
368 static inline void * alloc_devinfo_mem(void)
370 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
373 static inline void free_devinfo_mem(void *vaddr)
375 kmem_cache_free(iommu_devinfo_cache, vaddr);
/* Non-static: the iova allocator library calls these back. */
378 struct iova *alloc_iova_mem(void)
380 return iommu_kmem_cache_alloc(iommu_iova_cache);
383 void free_iova_mem(struct iova *iova)
385 kmem_cache_free(iommu_iova_cache, iova);
389 static inline int width_to_agaw(int width);
391 /* calculate agaw for each iommu.
392 * "SAGAW" may be different across iommus, use a default agaw, and
393 * get a supported less agaw for iommus that don't support the default agaw.
/* Walk down from the default AGAW until one is set in this unit's SAGAW. */
395 int iommu_calculate_agaw(struct intel_iommu *iommu)
400 sagaw = cap_sagaw(iommu->cap);
401 for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
403 if (test_bit(agaw, &sagaw))
410 /* in native case, each domain is related to only one iommu */
411 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
/* VM domains may span several iommus and must not use this helper. */
415 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
417 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
418 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
421 return g_iommus[iommu_id];
424 /* "Coherency" capability may be different across iommus */
/* Domain is coherent only if EVERY iommu in its bitmap is coherent. */
425 static void domain_update_iommu_coherency(struct dmar_domain *domain)
429 domain->iommu_coherency = 1;
431 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
432 for (; i < g_num_of_iommus; ) {
433 if (!ecap_coherent(g_iommus[i]->ecap)) {
434 domain->iommu_coherency = 0;
437 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
/* Find the DRHD unit that covers PCI device bus:devfn (explicit device
 * list first, else an include-all unit). */
441 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
443 struct dmar_drhd_unit *drhd = NULL;
446 for_each_drhd_unit(drhd) {
450 for (i = 0; i < drhd->devices_cnt; i++)
451 if (drhd->devices[i] &&
452 drhd->devices[i]->bus->number == bus &&
453 drhd->devices[i]->devfn == devfn)
456 if (drhd->include_all)
463 static void domain_flush_cache(struct dmar_domain *domain,
464 void *addr, int size)
466 if (!domain->iommu_coherency)
467 clflush_cache_range(addr, size);
470 /* Gets context entry for a given bus and devfn */
/*
 * Look up (and lazily allocate) the context table for @bus under
 * iommu->lock, publishing a newly allocated table through the root
 * entry with cache flushes, then return the entry for @devfn.
 */
471 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
474 struct root_entry *root;
475 struct context_entry *context;
476 unsigned long phy_addr;
479 spin_lock_irqsave(&iommu->lock, flags);
480 root = &iommu->root_entry[bus];
481 context = get_context_addr_from_root(root);
483 context = (struct context_entry *)alloc_pgtable_page();
485 spin_unlock_irqrestore(&iommu->lock, flags);
/* Flush the new table before making it visible via the root entry. */
488 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
489 phy_addr = virt_to_phys((void *)context);
490 set_root_value(root, phy_addr);
491 set_root_present(root);
492 __iommu_flush_cache(iommu, root, sizeof(*root));
494 spin_unlock_irqrestore(&iommu->lock, flags);
495 return &context[devfn];
/* Return whether a present context entry exists for bus:devfn. */
498 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
500 struct root_entry *root;
501 struct context_entry *context;
505 spin_lock_irqsave(&iommu->lock, flags);
506 root = &iommu->root_entry[bus];
507 context = get_context_addr_from_root(root);
512 ret = context_present(&context[devfn]);
514 spin_unlock_irqrestore(&iommu->lock, flags);
/* Clear the context entry for bus:devfn and flush it to memory. */
518 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
520 struct root_entry *root;
521 struct context_entry *context;
524 spin_lock_irqsave(&iommu->lock, flags);
525 root = &iommu->root_entry[bus];
526 context = get_context_addr_from_root(root);
528 context_clear_entry(&context[devfn]);
529 __iommu_flush_cache(iommu, &context[devfn], \
532 spin_unlock_irqrestore(&iommu->lock, flags);
/* Free every per-bus context table, then the root table itself. */
535 static void free_context_table(struct intel_iommu *iommu)
537 struct root_entry *root;
540 struct context_entry *context;
542 spin_lock_irqsave(&iommu->lock, flags);
543 if (!iommu->root_entry) {
546 for (i = 0; i < ROOT_ENTRY_NR; i++) {
547 root = &iommu->root_entry[i];
548 context = get_context_addr_from_root(root);
550 free_pgtable_page(context);
552 free_pgtable_page(iommu->root_entry);
553 iommu->root_entry = NULL;
555 spin_unlock_irqrestore(&iommu->lock, flags);
558 /* page table handling */
/* Each page-table level decodes 9 address bits (512 entries/page). */
559 #define LEVEL_STRIDE (9)
560 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
/* AGAW n => an (n+2)-level page table. */
562 static inline int agaw_to_level(int agaw)
/* AGAW n covers 30 + 9n address bits (AGAW 0 = 30-bit, 2-level). */
567 static inline int agaw_to_width(int agaw)
569 return 30 + agaw * LEVEL_STRIDE;
/* Inverse of agaw_to_width (width must be a supported 30+9n value). */
573 static inline int width_to_agaw(int width)
575 return (width - 30) / LEVEL_STRIDE;
/* Bit position where @level's index field starts (level 1 => bit 12). */
578 static inline unsigned int level_to_offset_bits(int level)
580 return (12 + (level - 1) * LEVEL_STRIDE);
/* Extract the 9-bit table index for @addr at @level. */
583 static inline int address_level_offset(u64 addr, int level)
585 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
/* Mask keeping the bits at or above @level's boundary. */
588 static inline u64 level_mask(int level)
590 return ((u64)-1 << level_to_offset_bits(level));
/* Number of bytes one entry at @level maps. */
593 static inline u64 level_size(int level)
595 return ((u64)1 << level_to_offset_bits(level));
/* Round @addr up to the next @level boundary. */
598 static inline u64 align_to_level(u64 addr, int level)
600 return ((addr + level_size(level) - 1) & level_mask(level));
/*
 * Walk (and build, allocating missing levels) the page table down to
 * the leaf PTE for @addr.  Holds domain->mapping_lock; newly allocated
 * intermediate tables get R+W set and are flushed via domain_flush_cache.
 */
603 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
605 int addr_width = agaw_to_width(domain->agaw);
606 struct dma_pte *parent, *pte = NULL;
607 int level = agaw_to_level(domain->agaw);
611 BUG_ON(!domain->pgd);
/* Mask the address to the domain's guest address width. */
613 addr &= (((u64)1) << addr_width) - 1;
614 parent = domain->pgd;
616 spin_lock_irqsave(&domain->mapping_lock, flags);
620 offset = address_level_offset(addr, level);
621 pte = &parent[offset];
625 if (!dma_pte_present(pte)) {
626 tmp_page = alloc_pgtable_page();
629 spin_unlock_irqrestore(&domain->mapping_lock,
633 domain_flush_cache(domain, tmp_page, PAGE_SIZE);
634 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
636 * high level table always sets r/w, last level page
637 * table control read/write
639 dma_set_pte_readable(pte);
640 dma_set_pte_writable(pte);
641 domain_flush_cache(domain, pte, sizeof(*pte));
643 parent = phys_to_virt(dma_pte_addr(pte));
647 spin_unlock_irqrestore(&domain->mapping_lock, flags);
651 /* return address's pte at specific level */
/* Read-only walk: return @addr's PTE at @level, or NULL on a hole. */
652 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
655 struct dma_pte *parent, *pte = NULL;
656 int total = agaw_to_level(domain->agaw);
659 parent = domain->pgd;
660 while (level <= total) {
661 offset = address_level_offset(addr, total);
662 pte = &parent[offset];
666 if (!dma_pte_present(pte))
668 parent = phys_to_virt(dma_pte_addr(pte));
674 /* clear one page's page table */
/* Clear the leaf PTE mapping @addr and flush the PTE cacheline. */
675 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
677 struct dma_pte *pte = NULL;
679 /* get last level pte */
680 pte = dma_addr_level_pte(domain, addr, 1);
684 domain_flush_cache(domain, pte, sizeof(*pte));
688 /* clear last level pte, a tlb flush should be followed */
/* Clear all leaf PTEs in [start, end); caller must flush the IOTLB. */
689 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
691 int addr_width = agaw_to_width(domain->agaw);
693 start &= (((u64)1) << addr_width) - 1;
694 end &= (((u64)1) << addr_width) - 1;
695 /* in case it's partial page */
696 start = PAGE_ALIGN(start);
699 /* we don't need lock here, nobody else touches the iova range */
700 while (start < end) {
701 dma_pte_clear_one(domain, start);
702 start += VTD_PAGE_SIZE;
706 /* free page table pages. last level pte should already be cleared */
/* Free intermediate table pages fully contained in [start, end],
 * bottom-up; frees the top-level pgd too when the whole space is freed. */
707 static void dma_pte_free_pagetable(struct dmar_domain *domain,
710 int addr_width = agaw_to_width(domain->agaw);
712 int total = agaw_to_level(domain->agaw);
716 start &= (((u64)1) << addr_width) - 1;
717 end &= (((u64)1) << addr_width) - 1;
719 /* we don't need lock here, nobody else touches the iova range */
721 while (level <= total) {
722 tmp = align_to_level(start, level);
723 if (tmp >= end || (tmp + level_size(level) > end))
727 pte = dma_addr_level_pte(domain, tmp, level);
730 phys_to_virt(dma_pte_addr(pte)));
732 domain_flush_cache(domain, pte, sizeof(*pte));
734 tmp += level_size(level);
739 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
740 free_pgtable_page(domain->pgd);
/* Allocate the one-page root table, flush it, and install the pointer. */
746 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
748 struct root_entry *root;
751 root = (struct root_entry *)alloc_pgtable_page();
755 __iommu_flush_cache(iommu, root, ROOT_SIZE);
757 spin_lock_irqsave(&iommu->lock, flags);
758 iommu->root_entry = root;
759 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Program RTADDR with the root table's physical address and issue
 * Set Root Table Pointer, waiting for RTPS to latch.
 */
764 static void iommu_set_root_entry(struct intel_iommu *iommu)
770 addr = iommu->root_entry;
772 spin_lock_irqsave(&iommu->register_lock, flag);
773 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
775 cmd = iommu->gcmd | DMA_GCMD_SRTP;
776 writel(cmd, iommu->reg + DMAR_GCMD_REG);
778 /* Make sure hardware complete it */
779 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
780 readl, (sts & DMA_GSTS_RTPS), sts);
782 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Flush the IOMMU's internal write buffer (only needed when CAP.RWBF
 * is set, or on quirky chipsets flagged via rwbf_quirk).
 */
785 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
790 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
792 val = iommu->gcmd | DMA_GCMD_WBF;
794 spin_lock_irqsave(&iommu->register_lock, flag);
795 writel(val, iommu->reg + DMAR_GCMD_REG);
797 /* Make sure hardware complete it */
798 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
799 readl, (!(val & DMA_GSTS_WBFS)), val);
801 spin_unlock_irqrestore(&iommu->register_lock, flag);
804 /* return value determine if we need a write buffer flush */
/*
 * Invalidate context-cache entries (global / domain / device scope via
 * @type) by writing CCMD and polling ICC until the hardware completes.
 */
805 static int __iommu_flush_context(struct intel_iommu *iommu,
806 u16 did, u16 source_id, u8 function_mask, u64 type,
807 int non_present_entry_flush)
813 * In the non-present entry flush case, if hardware doesn't cache
814 * non-present entry we do nothing and if hardware cache non-present
815 * entry, we flush entries of domain 0 (the domain id is used to cache
816 * any non-present entries)
818 if (non_present_entry_flush) {
819 if (!cap_caching_mode(iommu->cap))
826 case DMA_CCMD_GLOBAL_INVL:
827 val = DMA_CCMD_GLOBAL_INVL;
829 case DMA_CCMD_DOMAIN_INVL:
830 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
832 case DMA_CCMD_DEVICE_INVL:
833 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
834 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
841 spin_lock_irqsave(&iommu->register_lock, flag);
842 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
844 /* Make sure hardware complete it */
845 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
846 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
848 spin_unlock_irqrestore(&iommu->register_lock, flag);
850 /* flush context entry will implicitly flush write buffer */
854 /* return value determine if we need a write buffer flush */
/*
 * Invalidate IOTLB entries (global / DSI / PSI via @type) using the
 * register-based interface; PSI additionally programs the IVA register
 * with the address and size order.
 */
855 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
856 u64 addr, unsigned int size_order, u64 type,
857 int non_present_entry_flush)
859 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
860 u64 val = 0, val_iva = 0;
864 * In the non-present entry flush case, if hardware doesn't cache
865 * non-present entry we do nothing and if hardware cache non-present
866 * entry, we flush entries of domain 0 (the domain id is used to cache
867 * any non-present entries)
869 if (non_present_entry_flush) {
870 if (!cap_caching_mode(iommu->cap))
877 case DMA_TLB_GLOBAL_FLUSH:
878 /* global flush doesn't need set IVA_REG */
879 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
881 case DMA_TLB_DSI_FLUSH:
882 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
884 case DMA_TLB_PSI_FLUSH:
885 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
886 /* Note: always flush non-leaf currently */
887 val_iva = size_order | addr;
892 /* Note: set drain read/write */
895 * This is probably to be super secure.. Looks like we can
896 * ignore it without any impact.
898 if (cap_read_drain(iommu->cap))
899 val |= DMA_TLB_READ_DRAIN;
901 if (cap_write_drain(iommu->cap))
902 val |= DMA_TLB_WRITE_DRAIN;
904 spin_lock_irqsave(&iommu->register_lock, flag);
905 /* Note: Only uses first TLB reg currently */
907 dmar_writeq(iommu->reg + tlb_offset, val_iva);
908 dmar_writeq(iommu->reg + tlb_offset + 8, val);
910 /* Make sure hardware complete it */
911 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
912 dmar_readq, (!(val & DMA_TLB_IVT)), val);
914 spin_unlock_irqrestore(&iommu->register_lock, flag);
916 /* check IOTLB invalidation granularity */
917 if (DMA_TLB_IAIG(val) == 0)
918 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
919 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
920 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
921 (unsigned long long)DMA_TLB_IIRG(type),
922 (unsigned long long)DMA_TLB_IAIG(val));
923 /* flush iotlb entry will implicitly flush write buffer */
/*
 * Page-selective IOTLB flush for @pages pages at @addr, falling back
 * to a domain-selective flush when PSI is unsupported or the rounded
 * size exceeds the hardware's max address mask.
 */
927 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
928 u64 addr, unsigned int pages, int non_present_entry_flush)
932 BUG_ON(addr & (~VTD_PAGE_MASK));
935 /* Fallback to domain selective flush if no PSI support */
936 if (!cap_pgsel_inv(iommu->cap))
937 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
939 non_present_entry_flush);
942 * PSI requires page size to be 2 ^ x, and the base address is naturally
943 * aligned to the size
945 mask = ilog2(__roundup_pow_of_two(pages));
946 /* Fallback to domain selective flush if size is too big */
947 if (mask > cap_max_amask_val(iommu->cap))
948 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
949 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
951 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
953 non_present_entry_flush);
/*
 * Clear the Enable Protected Memory bit (PMEN.EPM) so BIOS-protected
 * memory regions no longer block DMA, and wait for PRS to clear.
 */
956 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
961 spin_lock_irqsave(&iommu->register_lock, flags);
962 pmen = readl(iommu->reg + DMAR_PMEN_REG);
963 pmen &= ~DMA_PMEN_EPM;
964 writel(pmen, iommu->reg + DMAR_PMEN_REG);
966 /* wait for the protected region status bit to clear */
967 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
968 readl, !(pmen & DMA_PMEN_PRS), pmen);
970 spin_unlock_irqrestore(&iommu->register_lock, flags);
/* Set GCMD.TE and wait for GSTS.TES: DMA remapping becomes active. */
973 static int iommu_enable_translation(struct intel_iommu *iommu)
978 spin_lock_irqsave(&iommu->register_lock, flags);
979 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
981 /* Make sure hardware complete it */
982 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
983 readl, (sts & DMA_GSTS_TES), sts);
985 iommu->gcmd |= DMA_GCMD_TE;
986 spin_unlock_irqrestore(&iommu->register_lock, flags);
/* Clear GCMD.TE and wait for GSTS.TES to drop: remapping disabled. */
990 static int iommu_disable_translation(struct intel_iommu *iommu)
995 spin_lock_irqsave(&iommu->register_lock, flag);
996 iommu->gcmd &= ~DMA_GCMD_TE;
997 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
999 /* Make sure hardware complete it */
1000 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1001 readl, (!(sts & DMA_GSTS_TES)), sts);
1003 spin_unlock_irqrestore(&iommu->register_lock, flag);
1007 /* iommu interrupt handling. Most stuff are MSI-like. */
/* Human-readable names for hardware fault reason codes, indexed by code. */
1009 static const char *fault_reason_strings[] =
1012 "Present bit in root entry is clear",
1013 "Present bit in context entry is clear",
1014 "Invalid context entry",
1015 "Access beyond MGAW",
1016 "PTE Write access is not set",
1017 "PTE Read access is not set",
1018 "Next page table ptr is invalid",
1019 "Root table address invalid",
1020 "Context table ptr is invalid",
1021 "non-zero reserved fields in RTP",
1022 "non-zero reserved fields in CTP",
1023 "non-zero reserved fields in PTE",
/* Largest valid index into the table above. */
1025 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1027 const char *dmar_get_fault_reason(u8 fault_reason)
1029 if (fault_reason > MAX_FAULT_REASON_IDX)
1032 return fault_reason_strings[fault_reason];
/* Unmask the DMAR fault-event MSI by clearing FECTL; the trailing read
 * forces the posted write to complete. */
1035 void dmar_msi_unmask(unsigned int irq)
1037 struct intel_iommu *iommu = get_irq_data(irq);
1041 spin_lock_irqsave(&iommu->register_lock, flag);
1042 writel(0, iommu->reg + DMAR_FECTL_REG);
1043 /* Read a reg to force flush the post write */
1044 readl(iommu->reg + DMAR_FECTL_REG);
1045 spin_unlock_irqrestore(&iommu->register_lock, flag);
/* Mask the fault-event interrupt by setting FECTL.IM. */
1048 void dmar_msi_mask(unsigned int irq)
1051 struct intel_iommu *iommu = get_irq_data(irq);
1054 spin_lock_irqsave(&iommu->register_lock, flag);
1055 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1056 /* Read a reg to force flush the post write */
1057 readl(iommu->reg + DMAR_FECTL_REG);
1058 spin_unlock_irqrestore(&iommu->register_lock, flag);
/* Program the fault-event MSI data/address registers from @msg. */
1061 void dmar_msi_write(int irq, struct msi_msg *msg)
1063 struct intel_iommu *iommu = get_irq_data(irq);
1066 spin_lock_irqsave(&iommu->register_lock, flag);
1067 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1068 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1069 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1070 spin_unlock_irqrestore(&iommu->register_lock, flag);
/* Read back the fault-event MSI data/address registers into @msg. */
1073 void dmar_msi_read(int irq, struct msi_msg *msg)
1075 struct intel_iommu *iommu = get_irq_data(irq);
1078 spin_lock_irqsave(&iommu->register_lock, flag);
1079 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1080 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1081 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1082 spin_unlock_irqrestore(&iommu->register_lock, flag);
/* Log one decoded DMA remapping fault (direction, source BDF, address,
 * reason) to the kernel log. */
1085 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1086 u8 fault_reason, u16 source_id, unsigned long long addr)
1090 reason = dmar_get_fault_reason(fault_reason);
1093 "DMAR:[%s] Request device [%02x:%02x.%d] "
1094 "fault addr %llx \n"
1095 "DMAR:[fault reason %02d] %s\n",
1096 (type ? "DMA Read" : "DMA Write"),
1097 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1098 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
/* Each fault recording register is 16 bytes wide. */
1102 #define PRIMARY_FAULT_REG_LEN (16)
/*
 * Fault-event IRQ handler: walk the ring of fault recording registers
 * starting at the index reported in FSTS, decode and report each
 * pending fault, write-1-clear its F bit, and finally clear any
 * primary fault overflow.  Register lock is dropped around reporting.
 */
1103 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1105 struct intel_iommu *iommu = dev_id;
1106 int reg, fault_index;
1110 spin_lock_irqsave(&iommu->register_lock, flag);
1111 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1113 /* TBD: ignore advanced fault log currently */
1114 if (!(fault_status & DMA_FSTS_PPF))
1115 goto clear_overflow;
1117 fault_index = dma_fsts_fault_record_index(fault_status);
1118 reg = cap_fault_reg_offset(iommu->cap);
1126 /* highest 32 bits */
1127 data = readl(iommu->reg + reg +
1128 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1129 if (!(data & DMA_FRCD_F))
1132 fault_reason = dma_frcd_fault_reason(data);
1133 type = dma_frcd_type(data);
1135 data = readl(iommu->reg + reg +
1136 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1137 source_id = dma_frcd_source_id(data);
1139 guest_addr = dmar_readq(iommu->reg + reg +
1140 fault_index * PRIMARY_FAULT_REG_LEN);
1141 guest_addr = dma_frcd_page_addr(guest_addr);
1142 /* clear the fault */
1143 writel(DMA_FRCD_F, iommu->reg + reg +
1144 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1146 spin_unlock_irqrestore(&iommu->register_lock, flag);
1148 iommu_page_fault_do_one(iommu, type, fault_reason,
1149 source_id, guest_addr);
/* Wrap around the circular fault-record array. */
1152 if (fault_index > cap_num_fault_regs(iommu->cap))
1154 spin_lock_irqsave(&iommu->register_lock, flag);
1157 /* clear primary fault overflow */
1158 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1159 if (fault_status & DMA_FSTS_PFO)
1160 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1162 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Allocate an IRQ for the fault event, wire up the MSI, drain any
 * already-latched faults, then install iommu_page_fault as handler.
 */
1166 int dmar_set_interrupt(struct intel_iommu *iommu)
1172 printk(KERN_ERR "IOMMU: no free vectors\n");
1176 set_irq_data(irq, iommu);
1179 ret = arch_setup_dmar_msi(irq);
1181 set_irq_data(irq, NULL);
1187 /* Force fault register is cleared */
1188 iommu_page_fault(irq, iommu);
1190 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1192 printk(KERN_ERR "IOMMU: can't request irq\n");
/*
 * Allocate this unit's domain bookkeeping: a domain-id bitmap sized
 * from CAP.ND and a parallel array of dmar_domain pointers.  With
 * caching mode, domain id 0 is reserved for non-present-entry tags.
 */
1196 static int iommu_init_domains(struct intel_iommu *iommu)
1198 unsigned long ndomains;
1199 unsigned long nlongs;
1201 ndomains = cap_ndoms(iommu->cap);
1202 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
1203 nlongs = BITS_TO_LONGS(ndomains);
1205 /* TBD: there might be 64K domains,
1206 * consider other allocation for future chip
1208 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1209 if (!iommu->domain_ids) {
1210 printk(KERN_ERR "Allocating domain id array failed\n");
1213 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1215 if (!iommu->domains) {
1216 printk(KERN_ERR "Allocating domain array failed\n");
/* Unwind the id bitmap on partial failure. */
1217 kfree(iommu->domain_ids);
1221 spin_lock_init(&iommu->lock);
1224 * if Caching mode is set, then invalid translations are tagged
1225 * with domainid 0. Hence we need to pre-allocate it.
1227 if (cap_caching_mode(iommu->cap))
1228 set_bit(0, iommu->domain_ids);
1233 static void domain_exit(struct dmar_domain *domain);
1234 static void vm_domain_exit(struct dmar_domain *domain);
/*
 * Tear down everything this iommu owns: drop each attached domain
 * (tearing it down entirely once its iommu refcount hits zero),
 * disable translation, release the fault IRQ, free the domain arrays
 * and context tables, and clear the g_iommus[] slot.
 */
1236 void free_dmar_iommu(struct intel_iommu *iommu)
1238 struct dmar_domain *domain;
1240 unsigned long flags;
1242 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1243 for (; i < cap_ndoms(iommu->cap); ) {
1244 domain = iommu->domains[i];
1245 clear_bit(i, iommu->domain_ids);
1247 spin_lock_irqsave(&domain->iommu_lock, flags);
1248 if (--domain->iommu_count == 0) {
/* VM domains have their own teardown path. */
1249 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1250 vm_domain_exit(domain);
1252 domain_exit(domain);
1254 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1256 i = find_next_bit(iommu->domain_ids,
1257 cap_ndoms(iommu->cap), i+1);
1260 if (iommu->gcmd & DMA_GCMD_TE)
1261 iommu_disable_translation(iommu);
1264 set_irq_data(iommu->irq, NULL);
1265 /* This will mask the irq */
1266 free_irq(iommu->irq, iommu);
1267 destroy_irq(iommu->irq);
1270 kfree(iommu->domains);
1271 kfree(iommu->domain_ids);
1273 g_iommus[iommu->seq_id] = NULL;
1275 /* if all iommus are freed, free g_iommus */
1276 for (i = 0; i < g_num_of_iommus; i++) {
1281 if (i == g_num_of_iommus)
1284 /* free context mapping */
1285 free_context_table(iommu);
/*
 * Allocate a dmar_domain on @iommu: claim a free domain id from the
 * unit's bitmap under iommu->lock, record the iommu in the domain's
 * bitmap, and register the domain in iommu->domains[].  Returns NULL
 * when no id is free.
 */
1288 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1291 unsigned long ndomains;
1292 struct dmar_domain *domain;
1293 unsigned long flags;
1295 domain = alloc_domain_mem();
1299 ndomains = cap_ndoms(iommu->cap);
1301 spin_lock_irqsave(&iommu->lock, flags);
1302 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1303 if (num >= ndomains) {
1304 spin_unlock_irqrestore(&iommu->lock, flags);
1305 free_domain_mem(domain);
1306 printk(KERN_ERR "IOMMU: no free domain ids\n");
1310 set_bit(num, iommu->domain_ids);
1312 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1313 set_bit(iommu->seq_id, &domain->iommu_bmp);
1315 iommu->domains[num] = domain;
1316 spin_unlock_irqrestore(&iommu->lock, flags);
/* Release the domain's id back to its (single) iommu's bitmap. */
1321 static void iommu_free_domain(struct dmar_domain *domain)
1323 unsigned long flags;
1324 struct intel_iommu *iommu;
1326 iommu = domain_get_iommu(domain);
1328 spin_lock_irqsave(&iommu->lock, flags);
1329 clear_bit(domain->id, iommu->domain_ids);
1330 spin_unlock_irqrestore(&iommu->lock, flags);
/* Global template of IOVA ranges no domain may ever hand out. */
1333 static struct iova_domain reserved_iova_list;
/* Distinct lockdep classes: these locks nest against per-domain ones. */
1334 static struct lock_class_key reserved_alloc_key;
1335 static struct lock_class_key reserved_rbtree_key;
/*
 * Build the reserved-range template: the IO-APIC MMIO window plus the
 * MMIO resources of every PCI device (to prevent peer-to-peer DMA from
 * being remapped onto device BARs).
 */
1337 static void dmar_init_reserved_ranges(void)
1339 struct pci_dev *pdev = NULL;
1344 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1346 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1347 &reserved_alloc_key);
1348 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1349 &reserved_rbtree_key);
1351 /* IOAPIC ranges shouldn't be accessed by DMA */
1352 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1353 IOVA_PFN(IOAPIC_RANGE_END));
1355 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1357 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1358 for_each_pci_dev(pdev) {
1361 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1362 r = &pdev->resource[i];
1363 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1367 size = r->end - addr;
1368 size = PAGE_ALIGN(size);
1369 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1370 IOVA_PFN(size + addr) - 1);
1372 printk(KERN_ERR "Reserve iova failed\n");
/* Copy the globally reserved IOVA ranges into a new domain's allocator. */
1378 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1380 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/* Round a guest address width to a page-table-level boundary: levels
 * cover 9 bits each above the 12-bit page offset.
 * NOTE(review): the adjustment/return logic after computing r is elided
 * from this extract. */
1383 static inline int guestwidth_to_adjustwidth(int gaw)
1386 int r = (gaw - 12) % 9;
/*
 * domain_init - initialize a freshly allocated dmar_domain for the given
 * guest address width: set up its IOVA allocator, reserve special ranges,
 * pick an AGAW supported by the hardware, and allocate the top-level page
 * directory.
 * NOTE(review): extract elides error-return lines and braces.
 */
1397 static int domain_init(struct dmar_domain *domain, int guest_width)
1399 struct intel_iommu *iommu;
1400 int adjust_width, agaw;
1401 unsigned long sagaw;
1403 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1404 spin_lock_init(&domain->mapping_lock);
1405 spin_lock_init(&domain->iommu_lock);
1407 domain_reserve_special_ranges(domain);
1409 /* calculate AGAW */
1410 iommu = domain_get_iommu(domain);
/* clamp requested width to what the hardware can address (MGAW) */
1411 if (guest_width > cap_mgaw(iommu->cap))
1412 guest_width = cap_mgaw(iommu->cap);
1413 domain->gaw = guest_width;
1414 adjust_width = guestwidth_to_adjustwidth(guest_width);
1415 agaw = width_to_agaw(adjust_width);
1416 sagaw = cap_sagaw(iommu->cap);
1417 if (!test_bit(agaw, &sagaw)) {
1418 /* hardware doesn't support it, choose a bigger one */
1419 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1420 agaw = find_next_bit(&sagaw, 5, agaw);
1424 domain->agaw = agaw;
1425 INIT_LIST_HEAD(&domain->devices);
/* cache coherency of IOMMU page-walk accesses, from the ecap register */
1427 if (ecap_coherent(iommu->ecap))
1428 domain->iommu_coherency = 1;
1430 domain->iommu_coherency = 0;
1432 domain->iommu_count = 1;
1434 /* always allocate the top pgd */
1435 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1438 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
/*
 * domain_exit - tear down a domain: detach its devices, release its IOVA
 * allocator, clear and free all page tables, then return the domain id
 * and memory.
 * NOTE(review): extract elides the NULL-domain guard body and braces.
 */
1442 static void domain_exit(struct dmar_domain *domain)
1446 /* Domain 0 is reserved, so dont process it */
1450 domain_remove_dev_info(domain);
1452 put_iova_domain(&domain->iovad);
1453 end = DOMAIN_MAX_ADDR(domain->gaw);
/* page-align the end of the addressable range before clearing */
1454 end = end & (~PAGE_MASK);
1457 dma_pte_clear_range(domain, 0, end);
1459 /* free page tables */
1460 dma_pte_free_pagetable(domain, 0, end);
1462 iommu_free_domain(domain);
1463 free_domain_mem(domain);
/*
 * domain_context_mapping_one - program the context entry for one
 * (bus, devfn) so the device translates through this domain's page
 * tables, then flush the context cache and IOTLB and account the iommu
 * in the domain's bitmap.
 * NOTE(review): extract elides parameters beyond 'domain', several
 * declarations (num, id, agaw), returns and braces.
 */
1466 static int domain_context_mapping_one(struct dmar_domain *domain,
1469 struct context_entry *context;
1470 unsigned long flags;
1471 struct intel_iommu *iommu;
1472 struct dma_pte *pgd;
1474 unsigned long ndomains;
1478 pr_debug("Set context mapping for %02x:%02x.%d\n",
1479 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1480 BUG_ON(!domain->pgd);
1482 iommu = device_to_iommu(bus, devfn);
1486 context = device_to_context_entry(iommu, bus, devfn);
1489 spin_lock_irqsave(&iommu->lock, flags);
/* already mapped by someone else: nothing to do */
1490 if (context_present(context)) {
1491 spin_unlock_irqrestore(&iommu->lock, flags);
/* VM domains are not bound to one iommu; their id is per-iommu, so
 * find (or allocate) this iommu's id for the domain */
1498 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1501 /* find an available domain id for this device in iommu */
1502 ndomains = cap_ndoms(iommu->cap);
1503 num = find_first_bit(iommu->domain_ids, ndomains);
1504 for (; num < ndomains; ) {
1505 if (iommu->domains[num] == domain) {
1510 num = find_next_bit(iommu->domain_ids,
1511 cap_ndoms(iommu->cap), num+1);
1515 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1516 if (num >= ndomains) {
1517 spin_unlock_irqrestore(&iommu->lock, flags);
1518 printk(KERN_ERR "IOMMU: no free domain ids\n");
1522 set_bit(num, iommu->domain_ids);
1523 iommu->domains[num] = domain;
1527 /* Skip top levels of page tables for
1528 * iommu which has less agaw than default.
1530 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1531 pgd = phys_to_virt(dma_pte_addr(pgd));
1532 if (!dma_pte_present(pgd)) {
1533 spin_unlock_irqrestore(&iommu->lock, flags);
/* fill in the context entry and make it present */
1539 context_set_domain_id(context, id);
1540 context_set_address_width(context, iommu->agaw);
1541 context_set_address_root(context, virt_to_phys(pgd));
1542 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1543 context_set_fault_enable(context);
1544 context_set_present(context);
1545 domain_flush_cache(domain, context, sizeof(*context));
1547 /* it's a non-present to present mapping */
1548 if (iommu->flush.flush_context(iommu, domain->id,
1549 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1550 DMA_CCMD_DEVICE_INVL, 1))
1551 iommu_flush_write_buffer(iommu);
1553 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1555 spin_unlock_irqrestore(&iommu->lock, flags);
/* first device of this iommu in the domain: bump count, recompute
 * coherency across the iommus now involved */
1557 spin_lock_irqsave(&domain->iommu_lock, flags);
1558 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1559 domain->iommu_count++;
1560 domain_update_iommu_coherency(domain);
1562 spin_unlock_irqrestore(&domain->iommu_lock, flags);
/*
 * domain_context_mapping - map a device's context, then walk up any
 * PCIe-to-PCI bridge chain mapping every bridge too, since requests from
 * devices behind a conventional bridge carry the bridge's IDs.
 * NOTE(review): extract elides the return type line, ret declaration,
 * early returns and braces.
 */
1567 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1570 struct pci_dev *tmp, *parent;
1572 ret = domain_context_mapping_one(domain, pdev->bus->number,
1577 /* dependent device mapping */
1578 tmp = pci_find_upstream_pcie_bridge(pdev);
1581 /* Secondary interface's bus number and devfn 0 */
1582 parent = pdev->bus->self;
1583 while (parent != tmp) {
1584 ret = domain_context_mapping_one(domain, parent->bus->number,
1588 parent = parent->bus->self;
1590 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1591 return domain_context_mapping_one(domain,
1592 tmp->subordinate->number, 0);
1593 else /* this is a legacy PCI bridge */
1594 return domain_context_mapping_one(domain,
1595 tmp->bus->number, tmp->devfn);
/*
 * domain_context_mapped - mirror of domain_context_mapping: check whether
 * the device and every bridge on its upstream path already have present
 * context entries.
 * NOTE(review): extract elides ret declaration, early returns and braces.
 */
1598 static int domain_context_mapped(struct pci_dev *pdev)
1601 struct pci_dev *tmp, *parent;
1602 struct intel_iommu *iommu;
1604 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1608 ret = device_context_mapped(iommu,
1609 pdev->bus->number, pdev->devfn);
1612 /* dependent device mapping */
1613 tmp = pci_find_upstream_pcie_bridge(pdev);
1616 /* Secondary interface's bus number and devfn 0 */
1617 parent = pdev->bus->self;
1618 while (parent != tmp) {
1619 ret = device_context_mapped(iommu, parent->bus->number,
1623 parent = parent->bus->self;
/* PCIe-to-PCI bridge vs legacy PCI bridge, as in the mapping path */
1626 return device_context_mapped(iommu,
1627 tmp->subordinate->number, 0);
1629 return device_context_mapped(iommu,
1630 tmp->bus->number, tmp->devfn);
/*
 * domain_page_mapping - create IOVA->HPA page-table entries for a
 * contiguous range: iova..iova+size maps to hpa..hpa+size with the given
 * DMA_PTE_READ/WRITE protection bits.
 * NOTE(review): extract elides the return type line, 'index' declaration,
 * error returns, loop increments and braces.
 */
1634 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1635 u64 hpa, size_t size, int prot)
1637 u64 start_pfn, end_pfn;
1638 struct dma_pte *pte;
1640 int addr_width = agaw_to_width(domain->agaw);
/* mask the physical address to the domain's addressable width */
1642 hpa &= (((u64)1) << addr_width) - 1;
1644 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1647 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1648 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1650 while (start_pfn < end_pfn) {
1651 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1654 /* We don't need lock here, nobody else
1655 * touches the iova range
1657 BUG_ON(dma_pte_addr(pte));
1658 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1659 dma_set_pte_prot(pte, prot);
1660 domain_flush_cache(domain, pte, sizeof(*pte));
/*
 * iommu_detach_dev - clear the device's context entry, then globally
 * invalidate the context cache and IOTLB so stale translations go away.
 */
1667 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1672 clear_context_table(iommu, bus, devfn);
1673 iommu->flush.flush_context(iommu, 0, 0, 0,
1674 DMA_CCMD_GLOBAL_INVL, 0);
1675 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1676 DMA_TLB_GLOBAL_FLUSH, 0);
/*
 * domain_remove_dev_info - detach every device from the domain: unlink
 * each device_domain_info, clear the archdata back-pointer, detach the
 * device from its iommu and free the info struct.  The lock is dropped
 * around iommu_detach_dev (which flushes hardware) and retaken.
 */
1679 static void domain_remove_dev_info(struct dmar_domain *domain)
1681 struct device_domain_info *info;
1682 unsigned long flags;
1683 struct intel_iommu *iommu;
1685 spin_lock_irqsave(&device_domain_lock, flags);
1686 while (!list_empty(&domain->devices)) {
1687 info = list_entry(domain->devices.next,
1688 struct device_domain_info, link);
1689 list_del(&info->link);
1690 list_del(&info->global);
1692 info->dev->dev.archdata.iommu = NULL;
1693 spin_unlock_irqrestore(&device_domain_lock, flags);
1695 iommu = device_to_iommu(info->bus, info->devfn);
1696 iommu_detach_dev(iommu, info->bus, info->devfn);
1697 free_devinfo_mem(info);
1699 spin_lock_irqsave(&device_domain_lock, flags);
1701 spin_unlock_irqrestore(&device_domain_lock, flags);
1706 * Note: struct pci_dev->dev.archdata.iommu stores the per-device domain info
/*
 * find_domain - return the domain already attached to this device via
 * pdev->dev.archdata.iommu, or fall through (NULL path elided in this
 * extract) when none is attached.
 */
1708 static struct dmar_domain *
1709 find_domain(struct pci_dev *pdev)
1711 struct device_domain_info *info;
1713 /* No lock here, assumes no domain exit in normal case */
1714 info = pdev->dev.archdata.iommu;
1716 return info->domain;
1720 /* domain is initialized */
/*
 * get_domain_for_dev - find or create the domain for a device.  Devices
 * behind a PCIe-to-PCI bridge share one domain keyed by the bridge's
 * (bus, devfn); otherwise a fresh domain is allocated and initialized
 * with the requested guest address width.  Races with concurrent callers
 * are resolved under device_domain_lock ("somebody is fast" paths).
 * NOTE(review): extract elides early returns, error paths and braces.
 */
1721 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1723 struct dmar_domain *domain, *found = NULL;
1724 struct intel_iommu *iommu;
1725 struct dmar_drhd_unit *drhd;
1726 struct device_domain_info *info, *tmp;
1727 struct pci_dev *dev_tmp;
1728 unsigned long flags;
1729 int bus = 0, devfn = 0;
1731 domain = find_domain(pdev);
/* devices under a p2p bridge are keyed by the bridge, not themselves */
1735 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1737 if (dev_tmp->is_pcie) {
1738 bus = dev_tmp->subordinate->number;
1741 bus = dev_tmp->bus->number;
1742 devfn = dev_tmp->devfn;
1744 spin_lock_irqsave(&device_domain_lock, flags);
1745 list_for_each_entry(info, &device_domain_list, global) {
1746 if (info->bus == bus && info->devfn == devfn) {
1747 found = info->domain;
1751 spin_unlock_irqrestore(&device_domain_lock, flags);
1752 /* pcie-pci bridge already has a domain, uses it */
1759 /* Allocate new domain for the device */
1760 drhd = dmar_find_matched_drhd_unit(pdev);
1762 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1766 iommu = drhd->iommu;
1768 domain = iommu_alloc_domain(iommu);
1772 if (domain_init(domain, gaw)) {
1773 domain_exit(domain);
1777 /* register pcie-to-pci device */
1779 info = alloc_devinfo_mem();
1781 domain_exit(domain);
1785 info->devfn = devfn;
1787 info->domain = domain;
1788 /* This domain is shared by devices under p2p bridge */
1789 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1791 /* pcie-to-pci bridge already has a domain, uses it */
1793 spin_lock_irqsave(&device_domain_lock, flags);
1794 list_for_each_entry(tmp, &device_domain_list, global) {
1795 if (tmp->bus == bus && tmp->devfn == devfn) {
1796 found = tmp->domain;
/* lost the race: drop our freshly built domain/info */
1801 free_devinfo_mem(info);
1802 domain_exit(domain);
1805 list_add(&info->link, &domain->devices);
1806 list_add(&info->global, &device_domain_list);
1808 spin_unlock_irqrestore(&device_domain_lock, flags);
/* now register the device itself against the chosen domain */
1812 info = alloc_devinfo_mem();
1815 info->bus = pdev->bus->number;
1816 info->devfn = pdev->devfn;
1818 info->domain = domain;
1819 spin_lock_irqsave(&device_domain_lock, flags);
1820 /* somebody is fast */
1821 found = find_domain(pdev);
1822 if (found != NULL) {
1823 spin_unlock_irqrestore(&device_domain_lock, flags);
1824 if (found != domain) {
1825 domain_exit(domain);
1828 free_devinfo_mem(info);
1831 list_add(&info->link, &domain->devices);
1832 list_add(&info->global, &device_domain_list);
1833 pdev->dev.archdata.iommu = info;
1834 spin_unlock_irqrestore(&device_domain_lock, flags);
1837 /* recheck it here, maybe others set it */
1838 return find_domain(pdev);
/*
 * iommu_prepare_identity_map - build a 1:1 (IOVA == physical) mapping of
 * [start, end) for a device: reserve the IOVA range, clear any stale
 * PTEs, install read/write mappings and program the context entry.
 * NOTE(review): extract elides size/ret declarations, error returns and
 * braces; error path exits the domain.
 */
1841 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1842 unsigned long long start,
1843 unsigned long long end)
1845 struct dmar_domain *domain;
1847 unsigned long long base;
1851 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1852 pci_name(pdev), start, end);
1853 /* page table init */
1854 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1858 /* The address might not be aligned */
1859 base = start & PAGE_MASK;
1861 size = PAGE_ALIGN(size);
1862 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1863 IOVA_PFN(base + size) - 1)) {
1864 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1869 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1870 size, base, pci_name(pdev));
1872 * RMRR range might have overlap with physical memory range,
1875 dma_pte_clear_range(domain, base, base + size);
1877 ret = domain_page_mapping(domain, base, base, size,
1878 DMA_PTE_READ|DMA_PTE_WRITE);
1882 /* context entry init */
1883 ret = domain_context_mapping(domain, pdev);
1887 domain_exit(domain);
/*
 * iommu_prepare_rmrr_dev - identity-map an RMRR (reserved memory region)
 * for a device; skipped for devices the IOMMU bypasses
 * (DUMMY_DEVICE_DOMAIN_INFO).
 */
1892 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1893 struct pci_dev *pdev)
1895 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1897 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1898 rmrr->end_address + 1);
1901 #ifdef CONFIG_DMAR_GFX_WA
/* Carries the device and result through work_with_active_regions().
 * NOTE(review): a 'ret' member is implied by iommu_prepare_work_fn below
 * but its declaration is elided in this extract. */
1902 struct iommu_prepare_data {
1903 struct pci_dev *pdev;
/* Per-memory-region callback: identity-map [start_pfn, end_pfn) for the
 * device carried in datax; stores the result in data->ret. */
1907 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1908 unsigned long end_pfn, void *datax)
1910 struct iommu_prepare_data *data;
1912 data = (struct iommu_prepare_data *)datax;
1914 data->ret = iommu_prepare_identity_map(data->pdev,
1915 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
/* Identity-map every active memory region on every online node for the
 * device (graphics work-around: gfx devices get a full 1:1 mapping). */
1920 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1923 struct iommu_prepare_data data;
1928 for_each_online_node(nid) {
1929 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
/*
 * iommu_prepare_gfx_mapping - give every graphics device a 1:1 mapping
 * of all system memory (CONFIG_DMAR_GFX_WA work-around); no-op stub when
 * the option is off.
 */
1936 static void __init iommu_prepare_gfx_mapping(void)
1938 struct pci_dev *pdev = NULL;
1941 for_each_pci_dev(pdev) {
/* skip bypassed devices and anything that is not a display device */
1942 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1943 !IS_GFX_DEVICE(pdev))
1945 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1947 ret = iommu_prepare_with_active_regions(pdev);
1949 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1952 #else /* !CONFIG_DMAR_GFX_WA */
1953 static inline void iommu_prepare_gfx_mapping(void)
1959 #ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * iommu_prepare_isa - identity-map the first 16MB for the ISA/LPC bridge
 * so legacy floppy DMA keeps working; no-op stub when the option is off.
 * NOTE(review): the failure message says "0-64M" but the mapping below
 * covers 0-16M — message looks stale; confirm against upstream history.
 */
1960 static inline void iommu_prepare_isa(void)
1962 struct pci_dev *pdev;
1965 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1969 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1970 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1973 printk("IOMMU: Failed to create 0-64M identity map, "
1974 "floppy might not work\n");
1978 static inline void iommu_prepare_isa(void)
1982 #endif /* !CONFIG_DMAR_FLPY_WA */
/*
 * init_dmars - main DMA-remapping bring-up: allocate per-iommu state,
 * root entries and flush hooks; build RMRR/gfx/ISA identity mappings;
 * then program root entries, invalidate caches and enable translation on
 * every DRHD unit.
 * NOTE(review): extract elides error paths, g_num_of_iommus counting,
 * continue/return statements and braces.
 */
1984 static int __init init_dmars(void)
1986 struct dmar_drhd_unit *drhd;
1987 struct dmar_rmrr_unit *rmrr;
1988 struct pci_dev *pdev;
1989 struct intel_iommu *iommu;
1990 int i, ret, unit = 0;
1995 * initialize and program root entry to not present
1998 for_each_drhd_unit(drhd) {
2001 * lock not needed as this is only incremented in the single
2002 * threaded kernel __init code path all other access are read
2007 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2010 printk(KERN_ERR "Allocating global iommu array failed\n");
2015 deferred_flush = kzalloc(g_num_of_iommus *
2016 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2017 if (!deferred_flush) {
2023 for_each_drhd_unit(drhd) {
2027 iommu = drhd->iommu;
2028 g_iommus[iommu->seq_id] = iommu;
2030 ret = iommu_init_domains(iommu);
2036 * we could share the same root & context tables
2037 * among all IOMMUs. Need to Split it later.
2039 ret = iommu_alloc_root_entry(iommu);
2041 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
/* pick queued invalidation if available, else register-based flushing */
2046 for_each_drhd_unit(drhd) {
2050 iommu = drhd->iommu;
2051 if (dmar_enable_qi(iommu)) {
2053 * Queued Invalidate not enabled, use Register Based
2056 iommu->flush.flush_context = __iommu_flush_context;
2057 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2058 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2060 (unsigned long long)drhd->reg_base_addr);
2062 iommu->flush.flush_context = qi_flush_context;
2063 iommu->flush.flush_iotlb = qi_flush_iotlb;
2064 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2066 (unsigned long long)drhd->reg_base_addr);
2072 * for each dev attached to rmrr
2074 * locate drhd for dev, alloc domain for dev
2075 * allocate free domain
2076 * allocate page table entries for rmrr
2077 * if context not allocated for bus
2078 * allocate and init context
2079 * set present in root table for this bus
2080 * init context with domain, translation etc
2084 for_each_rmrr_units(rmrr) {
2085 for (i = 0; i < rmrr->devices_cnt; i++) {
2086 pdev = rmrr->devices[i];
2087 /* some BIOS lists non-exist devices in DMAR table */
2090 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2093 "IOMMU: mapping reserved region failed\n");
2097 iommu_prepare_gfx_mapping();
2099 iommu_prepare_isa();
2104 * global invalidate context cache
2105 * global invalidate iotlb
2106 * enable translation
2108 for_each_drhd_unit(drhd) {
2111 iommu = drhd->iommu;
2112 sprintf (iommu->name, "dmar%d", unit++);
2114 iommu_flush_write_buffer(iommu);
2116 ret = dmar_set_interrupt(iommu);
2120 iommu_set_root_entry(iommu);
2122 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2124 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2126 iommu_disable_protect_mem_regions(iommu);
2128 ret = iommu_enable_translation(iommu);
/* error-unwind loop over units (body elided in this extract) */
2135 for_each_drhd_unit(drhd) {
2138 iommu = drhd->iommu;
/* Size of the page-aligned span needed to cover [host_addr,
 * host_addr+size): add the sub-page offset, then round up. */
2145 static inline u64 aligned_size(u64 host_addr, size_t size)
2148 addr = (host_addr & (~PAGE_MASK)) + size;
2149 return PAGE_ALIGN(addr);
/*
 * iommu_alloc_iova - allocate an IOVA range of 'size' bytes below 'end'
 * from the domain's allocator (top-down: last arg of alloc_iova is 1).
 * NOTE(review): return type line and piova declaration elided.
 */
2153 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2157 /* Make sure it's in range */
2158 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2159 if (!size || (IOVA_START_ADDR + size > end))
2162 piova = alloc_iova(&domain->iovad,
2163 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
/*
 * __intel_alloc_iova - IOVA allocation policy for a device: prefer
 * 32-bit-addressable IOVAs unless the device's DMA mask is <= 32 bits or
 * forcedac is set; fall back to the full mask if the 32-bit space is
 * exhausted.
 */
2167 static struct iova *
2168 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2169 size_t size, u64 dma_mask)
2171 struct pci_dev *pdev = to_pci_dev(dev);
2172 struct iova *iova = NULL;
2174 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2175 iova = iommu_alloc_iova(domain, size, dma_mask);
2178 * First try to allocate an io virtual address in
2179 * DMA_32BIT_MASK and if that fails then try allocating
2182 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2184 iova = iommu_alloc_iova(domain, size, dma_mask);
2188 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
/*
 * get_valid_domain_for_dev - get (or create) the device's domain and
 * make sure its context entry is programmed; returns NULL-ish on failure
 * (error returns elided in this extract).
 */
2195 static struct dmar_domain *
2196 get_valid_domain_for_dev(struct pci_dev *pdev)
2198 struct dmar_domain *domain;
2201 domain = get_domain_for_dev(pdev,
2202 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2205 "Allocating domain for %s failed", pci_name(pdev));
2209 /* make sure context mapping is ok */
2210 if (unlikely(!domain_context_mapped(pdev))) {
2211 ret = domain_context_mapping(domain, pdev);
2214 "Domain context map for %s failed",
/*
 * __intel_map_single - core DMA map: allocate an IOVA for the (page-
 * aligned) span covering [paddr, paddr+size), install page-table
 * mappings with R/W protection derived from 'dir', flush the IOTLB, and
 * return the bus address including the original sub-page offset.
 * Bypassed devices (DUMMY_DEVICE_DOMAIN_INFO) fall through (elided).
 * NOTE(review): extract elides iova/prot/ret declarations, early
 * returns, the error label and braces.
 */
2223 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2224 size_t size, int dir, u64 dma_mask)
2226 struct pci_dev *pdev = to_pci_dev(hwdev);
2227 struct dmar_domain *domain;
2228 phys_addr_t start_paddr;
2232 struct intel_iommu *iommu;
2234 BUG_ON(dir == DMA_NONE);
2235 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2238 domain = get_valid_domain_for_dev(pdev);
2242 iommu = domain_get_iommu(domain);
2243 size = aligned_size((u64)paddr, size);
2245 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2249 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2252 * Check if DMAR supports zero-length reads on write only
2255 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2256 !cap_zlr(iommu->cap))
2257 prot |= DMA_PTE_READ;
2258 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2259 prot |= DMA_PTE_WRITE;
2261 * paddr - (paddr + size) might be partial page, we should map the whole
2262 * page. Note: if two part of one page are separately mapped, we
2263 * might have two guest_addr mapping to the same host paddr, but this
2264 * is not a big problem
2266 ret = domain_page_mapping(domain, start_paddr,
2267 ((u64)paddr) & PAGE_MASK, size, prot);
2271 /* it's a non-present to present mapping */
2272 ret = iommu_flush_iotlb_psi(iommu, domain->id,
2273 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2275 iommu_flush_write_buffer(iommu);
2277 return start_paddr + ((u64)paddr & (~PAGE_MASK));
/* error path: release the IOVA we grabbed before failing */
2281 __free_iova(&domain->iovad, iova);
2282 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
2283 pci_name(pdev), size, (unsigned long long)paddr, dir);
/* dma_map_ops .map_page: translate page+offset to a physical address and
 * delegate to __intel_map_single with the device's DMA mask. */
2287 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2288 unsigned long offset, size_t size,
2289 enum dma_data_direction dir,
2290 struct dma_attrs *attrs)
2292 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2293 dir, to_pci_dev(dev)->dma_mask);
/*
 * flush_unmaps - drain the deferred-unmap tables: for each iommu with
 * pending entries, do one global IOTLB flush, then free every deferred
 * IOVA.  Caller holds async_umap_flush_lock (see flush_unmaps_timeout).
 */
2296 static void flush_unmaps(void)
2302 /* just flush them all */
2303 for (i = 0; i < g_num_of_iommus; i++) {
2304 struct intel_iommu *iommu = g_iommus[i];
2308 if (deferred_flush[i].next) {
2309 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2310 DMA_TLB_GLOBAL_FLUSH, 0);
2311 for (j = 0; j < deferred_flush[i].next; j++) {
2312 __free_iova(&deferred_flush[i].domain[j]->iovad,
2313 deferred_flush[i].iova[j]);
2315 deferred_flush[i].next = 0;
/* Timer callback: run flush_unmaps() under async_umap_flush_lock (the
 * call itself is elided between the lock/unlock in this extract). */
2322 static void flush_unmaps_timeout(unsigned long data)
2324 unsigned long flags;
2326 spin_lock_irqsave(&async_umap_flush_lock, flags);
2328 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * add_unmap - queue an IOVA for deferred freeing on its iommu's
 * deferred_flush table; force an immediate flush at HIGH_WATER_MARK,
 * otherwise arm a 10ms timer to batch the IOTLB flush.
 * NOTE(review): iommu_id/next declarations and the flush call at the
 * high-water mark are elided in this extract.
 */
2331 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2333 unsigned long flags;
2335 struct intel_iommu *iommu;
2337 spin_lock_irqsave(&async_umap_flush_lock, flags);
2338 if (list_size == HIGH_WATER_MARK)
2341 iommu = domain_get_iommu(dom);
2342 iommu_id = iommu->seq_id;
2344 next = deferred_flush[iommu_id].next;
2345 deferred_flush[iommu_id].domain[next] = dom;
2346 deferred_flush[iommu_id].iova[next] = iova;
2347 deferred_flush[iommu_id].next++;
2350 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2354 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * intel_unmap_page - dma_map_ops .unmap_page: find the IOVA for
 * dev_addr, clear its PTEs and free the page tables, then either flush
 * the IOTLB and free the IOVA immediately (intel_iommu_strict) or defer
 * via add_unmap() to batch flushes.
 * NOTE(review): iova declaration, early returns and braces elided.
 */
2357 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2358 size_t size, enum dma_data_direction dir,
2359 struct dma_attrs *attrs)
2361 struct pci_dev *pdev = to_pci_dev(dev);
2362 struct dmar_domain *domain;
2363 unsigned long start_addr;
2365 struct intel_iommu *iommu;
2367 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2369 domain = find_domain(pdev);
2372 iommu = domain_get_iommu(domain);
2374 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2378 start_addr = iova->pfn_lo << PAGE_SHIFT;
2379 size = aligned_size((u64)dev_addr, size);
2381 pr_debug("Device %s unmapping: %lx@%llx\n",
2382 pci_name(pdev), size, (unsigned long long)start_addr);
2384 /* clear the whole page */
2385 dma_pte_clear_range(domain, start_addr, start_addr + size);
2386 /* free page tables */
2387 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2388 if (intel_iommu_strict) {
2389 if (iommu_flush_iotlb_psi(iommu,
2390 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2391 iommu_flush_write_buffer(iommu);
2393 __free_iova(&domain->iovad, iova);
2395 add_unmap(domain, iova);
2397 * queue up the release of the unmap to save the 1/6th of the
2398 * cpu used up by the iotlb flush operation...
/* Legacy single-buffer unmap: thin wrapper over intel_unmap_page. */
2403 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2406 intel_unmap_page(dev, dev_addr, size, dir, NULL);
/*
 * intel_alloc_coherent - dma_map_ops .alloc_coherent: grab zeroed pages,
 * map them bidirectionally through the IOMMU against the device's
 * coherent mask, and free the pages again if mapping fails.  GFP_DMA*
 * flags are stripped because the IOMMU removes the addressing limit.
 */
2409 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2410 dma_addr_t *dma_handle, gfp_t flags)
2415 size = PAGE_ALIGN(size);
2416 order = get_order(size);
2417 flags &= ~(GFP_DMA | GFP_DMA32);
2419 vaddr = (void *)__get_free_pages(flags, order);
2422 memset(vaddr, 0, size);
2424 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2426 hwdev->coherent_dma_mask);
2429 free_pages((unsigned long)vaddr, order);
/* dma_map_ops .free_coherent: unmap the IOMMU translation then return
 * the pages allocated by intel_alloc_coherent. */
2433 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2434 dma_addr_t dma_handle)
2438 size = PAGE_ALIGN(size);
2439 order = get_order(size);
2441 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2442 free_pages((unsigned long)vaddr, order);
/* Kernel virtual address of a scatterlist entry's data. */
2445 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
/*
 * intel_unmap_sg - dma_map_ops .unmap_sg: recompute the total aligned
 * size of the list, clear the PTEs and page tables for the contiguous
 * IOVA span allocated at map time, flush the IOTLB and free the IOVA.
 * NOTE(review): iova/addr/size/i declarations, early returns and braces
 * elided in this extract.
 */
2447 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2448 int nelems, enum dma_data_direction dir,
2449 struct dma_attrs *attrs)
2452 struct pci_dev *pdev = to_pci_dev(hwdev);
2453 struct dmar_domain *domain;
2454 unsigned long start_addr;
2458 struct scatterlist *sg;
2459 struct intel_iommu *iommu;
2461 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2464 domain = find_domain(pdev);
2467 iommu = domain_get_iommu(domain);
2469 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2472 for_each_sg(sglist, sg, nelems, i) {
2473 addr = SG_ENT_VIRT_ADDRESS(sg);
2474 size += aligned_size((u64)addr, sg->length);
2477 start_addr = iova->pfn_lo << PAGE_SHIFT;
2479 /* clear the whole page */
2480 dma_pte_clear_range(domain, start_addr, start_addr + size);
2481 /* free page tables */
2482 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2484 if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2485 size >> VTD_PAGE_SHIFT, 0))
2486 iommu_flush_write_buffer(iommu);
2489 __free_iova(&domain->iovad, iova);
/* Identity scatterlist "mapping" for devices bypassing the IOMMU: each
 * entry's DMA address is just its bus address; no translation set up. */
2492 static int intel_nontranslate_map_sg(struct device *hddev,
2493 struct scatterlist *sglist, int nelems, int dir)
2496 struct scatterlist *sg;
2498 for_each_sg(sglist, sg, nelems, i) {
2499 BUG_ON(!sg_page(sg));
2500 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2501 sg->dma_length = sg->length;
/*
 * intel_map_sg - dma_map_ops .map_sg: allocate one contiguous IOVA span
 * big enough for every (page-aligned) entry, map each entry into it at
 * increasing offsets, unwinding everything on failure, then flush the
 * IOTLB for the new mappings.
 * NOTE(review): i/addr/size/offset/prot/ret declarations, offset
 * accumulation, early returns and braces are elided in this extract.
 */
2506 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2507 enum dma_data_direction dir, struct dma_attrs *attrs)
2511 struct pci_dev *pdev = to_pci_dev(hwdev);
2512 struct dmar_domain *domain;
2516 struct iova *iova = NULL;
2518 struct scatterlist *sg;
2519 unsigned long start_addr;
2520 struct intel_iommu *iommu;
2522 BUG_ON(dir == DMA_NONE);
2523 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2524 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2526 domain = get_valid_domain_for_dev(pdev);
2530 iommu = domain_get_iommu(domain);
/* first pass: total aligned size so one IOVA span covers the list */
2532 for_each_sg(sglist, sg, nelems, i) {
2533 addr = SG_ENT_VIRT_ADDRESS(sg);
2534 addr = (void *)virt_to_phys(addr);
2535 size += aligned_size((u64)addr, sg->length);
2538 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2540 sglist->dma_length = 0;
2545 * Check if DMAR supports zero-length reads on write only
2548 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2549 !cap_zlr(iommu->cap))
2550 prot |= DMA_PTE_READ;
2551 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2552 prot |= DMA_PTE_WRITE;
2554 start_addr = iova->pfn_lo << PAGE_SHIFT;
/* second pass: map each entry at its offset within the span */
2556 for_each_sg(sglist, sg, nelems, i) {
2557 addr = SG_ENT_VIRT_ADDRESS(sg);
2558 addr = (void *)virt_to_phys(addr);
2559 size = aligned_size((u64)addr, sg->length);
2560 ret = domain_page_mapping(domain, start_addr + offset,
2561 ((u64)addr) & PAGE_MASK,
/* unwind on failure: clear what we mapped, free tables and IOVA */
2564 /* clear the page */
2565 dma_pte_clear_range(domain, start_addr,
2566 start_addr + offset);
2567 /* free page tables */
2568 dma_pte_free_pagetable(domain, start_addr,
2569 start_addr + offset);
2571 __free_iova(&domain->iovad, iova);
2574 sg->dma_address = start_addr + offset +
2575 ((u64)addr & (~PAGE_MASK));
2576 sg->dma_length = sg->length;
2580 /* it's a non-present to present mapping */
2581 if (iommu_flush_iotlb_psi(iommu, domain->id,
2582 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2583 iommu_flush_write_buffer(iommu);
2587 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
/* DMA operations table installed as the platform dma_ops when the
 * Intel IOMMU is active (see intel_iommu_init). */
2592 struct dma_map_ops intel_dma_ops = {
2593 .alloc_coherent = intel_alloc_coherent,
2594 .free_coherent = intel_free_coherent,
2595 .map_sg = intel_map_sg,
2596 .unmap_sg = intel_unmap_sg,
2597 .map_page = intel_map_page,
2598 .unmap_page = intel_unmap_page,
2599 .mapping_error = intel_mapping_error,
/* Create the slab cache for struct dmar_domain objects. */
2602 static inline int iommu_domain_cache_init(void)
2606 iommu_domain_cache = kmem_cache_create("iommu_domain",
2607 sizeof(struct dmar_domain),
2612 if (!iommu_domain_cache) {
2613 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
/* Create the slab cache for struct device_domain_info objects. */
2620 static inline int iommu_devinfo_cache_init(void)
2624 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2625 sizeof(struct device_domain_info),
2629 if (!iommu_devinfo_cache) {
2630 printk(KERN_ERR "Couldn't create devinfo cache\n");
/* Create the slab cache for struct iova objects. */
2637 static inline int iommu_iova_cache_init(void)
2641 iommu_iova_cache = kmem_cache_create("iommu_iova",
2642 sizeof(struct iova),
2646 if (!iommu_iova_cache) {
2647 printk(KERN_ERR "Couldn't create iova cache\n");
/*
 * iommu_init_mempool - create the three slab caches (iova, domain,
 * devinfo) with goto-based unwind: a later failure destroys the caches
 * created before it.  Error labels/returns elided in this extract.
 */
2654 static int __init iommu_init_mempool(void)
2657 ret = iommu_iova_cache_init();
2661 ret = iommu_domain_cache_init();
2665 ret = iommu_devinfo_cache_init();
2669 kmem_cache_destroy(iommu_domain_cache);
2671 kmem_cache_destroy(iommu_iova_cache);
/* Destroy all three IOMMU slab caches (reverse of iommu_init_mempool). */
2676 static void __init iommu_exit_mempool(void)
2678 kmem_cache_destroy(iommu_devinfo_cache);
2679 kmem_cache_destroy(iommu_domain_cache);
2680 kmem_cache_destroy(iommu_iova_cache);
/*
 * init_no_remapping_devices - mark DRHD units and devices to bypass
 * remapping: a non-include-all unit with no devices is ignored, and a
 * unit whose devices are all graphics gets every device tagged with
 * DUMMY_DEVICE_DOMAIN_INFO so DMA ops pass straight through.
 * NOTE(review): i declarations, drhd->ignored assignments, continue
 * statements and braces elided in this extract.
 */
2684 static void __init init_no_remapping_devices(void)
2686 struct dmar_drhd_unit *drhd;
2688 for_each_drhd_unit(drhd) {
2689 if (!drhd->include_all) {
2691 for (i = 0; i < drhd->devices_cnt; i++)
2692 if (drhd->devices[i] != NULL)
2694 /* ignore DMAR unit if no pci devices exist */
2695 if (i == drhd->devices_cnt)
2703 for_each_drhd_unit(drhd) {
2705 if (drhd->ignored || drhd->include_all)
2708 for (i = 0; i < drhd->devices_cnt; i++)
2709 if (drhd->devices[i] &&
2710 !IS_GFX_DEVICE(drhd->devices[i]))
2713 if (i < drhd->devices_cnt)
2716 /* bypass IOMMU if it is just for gfx devices */
2718 for (i = 0; i < drhd->devices_cnt; i++) {
2719 if (!drhd->devices[i])
2721 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
/*
 * intel_iommu_init - driver entry point: parse the DMAR table, and if
 * DMA remapping is wanted (not disabled, no swiotlb), set up mempools,
 * reserved ranges and the hardware (init_dmars), then install
 * intel_dma_ops as the platform DMA ops and register the iommu ops.
 * NOTE(review): error returns and the init_dmars() call preceding the
 * failure branch are elided in this extract.
 */
2726 int __init intel_iommu_init(void)
2730 if (dmar_table_init())
2733 if (dmar_dev_scope_init())
2737 * Check the need for DMA-remapping initialization now.
2738 * Above initialization will also be used by Interrupt-remapping.
2740 if (no_iommu || swiotlb || dmar_disabled)
2743 iommu_init_mempool();
2744 dmar_init_reserved_ranges();
2746 init_no_remapping_devices();
2750 printk(KERN_ERR "IOMMU: dmar init failed\n");
2751 put_iova_domain(&reserved_iova_list);
2752 iommu_exit_mempool();
2756 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2758 init_timer(&unmap_timer);
2760 dma_ops = &intel_dma_ops;
2762 register_iommu(&intel_iommu_ops);
/*
 * vm_domain_add_dev_info - attach a device to a VM domain: allocate a
 * device_domain_info, link it into the domain and global lists, and set
 * the device's archdata back-pointer, all under device_domain_lock.
 */
2767 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2768 struct pci_dev *pdev)
2770 struct device_domain_info *info;
2771 unsigned long flags;
2773 info = alloc_devinfo_mem();
2777 info->bus = pdev->bus->number;
2778 info->devfn = pdev->devfn;
2780 info->domain = domain;
2782 spin_lock_irqsave(&device_domain_lock, flags);
2783 list_add(&info->link, &domain->devices);
2784 list_add(&info->global, &device_domain_list);
2785 pdev->dev.archdata.iommu = info;
2786 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * vm_domain_remove_one_dev_info - detach one device from a VM domain:
 * unlink its info, detach it from the iommu, and if no other device in
 * the domain still sits behind the same iommu, drop that iommu from the
 * domain's bitmap and recompute count/coherency.
 * NOTE(review): the 'found' tracking between the two halves and several
 * control-flow lines are elided in this extract.
 */
2791 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2792 struct pci_dev *pdev)
2794 struct device_domain_info *info;
2795 struct intel_iommu *iommu;
2796 unsigned long flags;
2798 struct list_head *entry, *tmp;
2800 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2804 spin_lock_irqsave(&device_domain_lock, flags);
2805 list_for_each_safe(entry, tmp, &domain->devices) {
2806 info = list_entry(entry, struct device_domain_info, link);
2807 if (info->bus == pdev->bus->number &&
2808 info->devfn == pdev->devfn) {
2809 list_del(&info->link);
2810 list_del(&info->global);
2812 info->dev->dev.archdata.iommu = NULL;
/* drop the lock around the hardware detach, then retake it */
2813 spin_unlock_irqrestore(&device_domain_lock, flags);
2815 iommu_detach_dev(iommu, info->bus, info->devfn);
2816 free_devinfo_mem(info);
2818 spin_lock_irqsave(&device_domain_lock, flags);
2826 /* if there is no other devices under the same iommu
2827 * owned by this domain, clear this iommu in iommu_bmp
2828 * update iommu count and coherency
2830 if (device_to_iommu(info->bus, info->devfn) == iommu)
2835 unsigned long tmp_flags;
2836 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2837 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2838 domain->iommu_count--;
2839 domain_update_iommu_coherency(domain);
2840 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2843 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * vm_domain_remove_all_dev_info - detach every device from a VM domain,
 * clearing each device's iommu from the domain bitmap and updating
 * count/coherency as devices go.  device_domain_lock is dropped around
 * the hardware detach and the per-domain iommu_lock work.
 */
2846 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2848 struct device_domain_info *info;
2849 struct intel_iommu *iommu;
2850 unsigned long flags1, flags2;
2852 spin_lock_irqsave(&device_domain_lock, flags1);
2853 while (!list_empty(&domain->devices)) {
2854 info = list_entry(domain->devices.next,
2855 struct device_domain_info, link);
2856 list_del(&info->link);
2857 list_del(&info->global);
2859 info->dev->dev.archdata.iommu = NULL;
2861 spin_unlock_irqrestore(&device_domain_lock, flags1);
2863 iommu = device_to_iommu(info->bus, info->devfn);
2864 iommu_detach_dev(iommu, info->bus, info->devfn);
2866 /* clear this iommu in iommu_bmp, update iommu count
2869 spin_lock_irqsave(&domain->iommu_lock, flags2);
2870 if (test_and_clear_bit(iommu->seq_id,
2871 &domain->iommu_bmp)) {
2872 domain->iommu_count--;
2873 domain_update_iommu_coherency(domain);
2875 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2877 free_devinfo_mem(info);
2878 spin_lock_irqsave(&device_domain_lock, flags1);
2880 spin_unlock_irqrestore(&device_domain_lock, flags1);
2883 /* domain id for virtual machine, it won't be set in context */
/* NOTE(review): plain counter incremented in iommu_alloc_vm_domain()
 * with no visible lock — confirm callers serialize allocation, else
 * concurrent allocs could hand out duplicate ids. */
2884 static unsigned long vm_domid;
/*
 * Return the smallest AGAW (adjusted guest address width) among all
 * iommus bound to @domain, i.e. those with a bit set in
 * domain->iommu_bmp.  Starts from domain->agaw and walks the bitmap;
 * a mapping must fit the most restrictive hardware unit backing the
 * domain.
 */
2886 static int vm_domain_min_agaw(struct dmar_domain *domain)
2889 int min_agaw = domain->agaw;
2891 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2892 for (; i < g_num_of_iommus; ) {
/* track the minimum agaw over every iommu attached to the domain */
2893 if (min_agaw > g_iommus[i]->agaw)
2894 min_agaw = g_iommus[i]->agaw;
2896 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
/*
 * Allocate a dmar_domain for a virtual machine.
 *
 * VM domains take their id from the software vm_domid counter (not a
 * hardware domain id — see the comment at vm_domid), start with an
 * empty iommu bitmap, and are tagged DOMAIN_FLAG_VIRTUAL_MACHINE so
 * the VM-specific code paths can recognize them.
 */
2902 static struct dmar_domain *iommu_alloc_vm_domain(void)
2904 struct dmar_domain *domain;
2906 domain = alloc_domain_mem();
2910 domain->id = vm_domid++;
2911 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2912 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
/*
 * Initialize a freshly allocated VM domain for a guest address width of
 * @guest_width bits: set up the IOVA allocator (32-bit DMA range) and
 * locks, reserve the special IOVA ranges (e.g. the IOAPIC window),
 * derive the AGAW from the guest width, zero the per-iommu bookkeeping,
 * and allocate the top-level page directory.
 */
2917 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2921 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2922 spin_lock_init(&domain->mapping_lock);
2923 spin_lock_init(&domain->iommu_lock);
2925 domain_reserve_special_ranges(domain);
2927 /* calculate AGAW */
2928 domain->gaw = guest_width;
2929 adjust_width = guestwidth_to_adjustwidth(guest_width);
2930 domain->agaw = width_to_agaw(adjust_width);
2932 INIT_LIST_HEAD(&domain->devices);
/* no iommus attached yet; counts/coherency filled in as devices attach */
2934 domain->iommu_count = 0;
2935 domain->iommu_coherency = 0;
2936 domain->max_addr = 0;
2938 /* always allocate the top pgd */
2939 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
/* flush the new (empty) pgd if the hardware is not cache coherent */
2942 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
/*
 * Release @domain's hardware domain-id slots on every DRHD unit: scan
 * each iommu's domain_ids bitmap and, wherever iommu->domains[] points
 * at this domain, clear the id bit and the pointer under iommu->lock.
 */
2946 static void iommu_free_vm_domain(struct dmar_domain *domain)
2948 unsigned long flags;
2949 struct dmar_drhd_unit *drhd;
2950 struct intel_iommu *iommu;
2952 unsigned long ndomains;
2954 for_each_drhd_unit(drhd) {
2957 iommu = drhd->iommu;
/* number of domain-id slots this iommu supports */
2959 ndomains = cap_ndoms(iommu->cap);
2960 i = find_first_bit(iommu->domain_ids, ndomains);
2961 for (; i < ndomains; ) {
2962 if (iommu->domains[i] == domain) {
2963 spin_lock_irqsave(&iommu->lock, flags);
2964 clear_bit(i, iommu->domain_ids);
2965 iommu->domains[i] = NULL;
2966 spin_unlock_irqrestore(&iommu->lock, flags);
2969 i = find_next_bit(iommu->domain_ids, ndomains, i+1);
/*
 * Full teardown of a VM domain: detach all devices, release the IOVA
 * allocator, clear and free the DMA page tables over the whole guest
 * address range, drop the domain's hardware domain-id slots, and free
 * the domain structure itself.
 */
2974 static void vm_domain_exit(struct dmar_domain *domain)
2978 /* Domain 0 is reserved, so don't process it */
2982 vm_domain_remove_all_dev_info(domain);
2984 put_iova_domain(&domain->iovad);
/* highest page-aligned address covered by this domain's gaw */
2985 end = DOMAIN_MAX_ADDR(domain->gaw);
2986 end = end & (~VTD_PAGE_MASK);
2989 dma_pte_clear_range(domain, 0, end);
2991 /* free page tables */
2992 dma_pte_free_pagetable(domain, 0, end);
2994 iommu_free_vm_domain(domain);
2995 free_domain_mem(domain);
/*
 * iommu_ops .domain_init callback: back the generic iommu_domain with a
 * freshly allocated and initialized VT-d VM domain (default 48-bit
 * address width) stored in domain->priv.  On init failure the partially
 * built domain is torn down via vm_domain_exit().
 */
2998 static int intel_iommu_domain_init(struct iommu_domain *domain)
3000 struct dmar_domain *dmar_domain;
3002 dmar_domain = iommu_alloc_vm_domain();
3005 "intel_iommu_domain_init: dmar_domain == NULL\n");
3008 if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3010 "intel_iommu_domain_init() failed\n");
3011 vm_domain_exit(dmar_domain);
3014 domain->priv = dmar_domain;
/*
 * iommu_ops .domain_destroy callback: detach the VT-d domain from the
 * generic wrapper (priv cleared first so no stale pointer survives the
 * teardown) and destroy it.
 */
3019 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3021 struct dmar_domain *dmar_domain = domain->priv;
3023 domain->priv = NULL;
3024 vm_domain_exit(dmar_domain);
/*
 * iommu_ops .attach_dev callback: attach a PCI device to a VT-d domain.
 *
 * If the device already has a context mapping, it is first removed from
 * its old domain (VM path or native path depending on the old domain's
 * flags).  The target iommu's AGAW is then checked against the domain's
 * current max_addr — attaching a unit too narrow for already-mapped
 * addresses is rejected.  Finally the context entry is programmed and
 * the device is recorded in the domain's device list.
 */
3027 static int intel_iommu_attach_device(struct iommu_domain *domain,
3030 struct dmar_domain *dmar_domain = domain->priv;
3031 struct pci_dev *pdev = to_pci_dev(dev);
3032 struct intel_iommu *iommu;
3037 /* normally pdev is not mapped */
3038 if (unlikely(domain_context_mapped(pdev))) {
3039 struct dmar_domain *old_domain;
3041 old_domain = find_domain(pdev);
3043 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3044 vm_domain_remove_one_dev_info(old_domain, pdev);
3046 domain_remove_dev_info(old_domain);
3050 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3054 /* check if this iommu agaw is sufficient for max mapped address */
3055 addr_width = agaw_to_width(iommu->agaw);
3056 end = DOMAIN_MAX_ADDR(addr_width);
3057 end = end & VTD_PAGE_MASK;
3058 if (end < dmar_domain->max_addr) {
3059 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3060 "sufficient for the mapped address (%llx)\n",
3061 __func__, iommu->agaw, dmar_domain->max_addr);
3065 ret = domain_context_mapping(dmar_domain, pdev);
3069 ret = vm_domain_add_dev_info(dmar_domain, pdev);
/*
 * iommu_ops .detach_dev callback: remove one PCI device from a VT-d
 * domain (tears down its context entry and per-device bookkeeping).
 */
3073 static void intel_iommu_detach_device(struct iommu_domain *domain,
3076 struct dmar_domain *dmar_domain = domain->priv;
3077 struct pci_dev *pdev = to_pci_dev(dev);
3079 vm_domain_remove_one_dev_info(dmar_domain, pdev);
/*
 * iommu_ops .map callback: map [iova, iova+size) to host physical
 * address @hpa with read/write permissions taken from @iommu_prot.
 *
 * If the mapping would raise the domain's max_addr, the new end address
 * is first checked against the minimum AGAW of all attached iommus, so
 * no attached unit is asked to translate beyond its reach; max_addr is
 * only advanced after the check passes.
 */
3082 static int intel_iommu_map_range(struct iommu_domain *domain,
3083 unsigned long iova, phys_addr_t hpa,
3084 size_t size, int iommu_prot)
3086 struct dmar_domain *dmar_domain = domain->priv;
/* translate generic IOMMU_* prot bits into VT-d PTE bits */
3092 if (iommu_prot & IOMMU_READ)
3093 prot |= DMA_PTE_READ;
3094 if (iommu_prot & IOMMU_WRITE)
3095 prot |= DMA_PTE_WRITE;
/* page-aligned end of the requested mapping */
3097 max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3098 if (dmar_domain->max_addr < max_addr) {
3102 /* check if minimum agaw is sufficient for mapped address */
3103 min_agaw = vm_domain_min_agaw(dmar_domain);
3104 addr_width = agaw_to_width(min_agaw);
3105 end = DOMAIN_MAX_ADDR(addr_width);
3106 end = end & VTD_PAGE_MASK;
3107 if (end < max_addr) {
3108 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3109 "sufficient for the mapped address (%llx)\n",
3110 __func__, min_agaw, max_addr);
3113 dmar_domain->max_addr = max_addr;
3116 ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
/*
 * iommu_ops .unmap callback: clear the PTEs covering [iova, iova+size),
 * widening the request to page boundaries first.  If the cleared range
 * ends exactly at the domain's high-water mark, max_addr is pulled back
 * to the range base.
 */
3120 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3121 unsigned long iova, size_t size)
3123 struct dmar_domain *dmar_domain = domain->priv;
3126 /* The address might not be aligned */
3127 base = iova & VTD_PAGE_MASK;
3128 size = VTD_PAGE_ALIGN(size);
3129 dma_pte_clear_range(dmar_domain, base, base + size);
3131 if (dmar_domain->max_addr == base + size)
3132 dmar_domain->max_addr = base;
/*
 * iommu_ops .iova_to_phys callback: walk the domain's page table for
 * @iova and return the physical address held in the leaf PTE.
 */
3135 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3138 struct dmar_domain *dmar_domain = domain->priv;
3139 struct dma_pte *pte;
3142 pte = addr_to_dma_pte(dmar_domain, iova);
3144 phys = dma_pte_addr(pte);
/* VT-d implementation of the generic IOMMU API (linux/iommu.h) */
3149 static struct iommu_ops intel_iommu_ops = {
3150 .domain_init = intel_iommu_domain_init,
3151 .domain_destroy = intel_iommu_domain_destroy,
3152 .attach_dev = intel_iommu_attach_device,
3153 .detach_dev = intel_iommu_detach_device,
3154 .map = intel_iommu_map_range,
3155 .unmap = intel_iommu_unmap_range,
3156 .iova_to_phys = intel_iommu_iova_to_phys,
/*
 * PCI quirk for the Intel Mobile 4 Series chipset (device id 0x2a40):
 * the hardware needs required-write-buffer-flushing but does not set
 * the RWBF bit in its capability register, so announce and force the
 * workaround.  Runs at PCI header fixup time via the
 * DECLARE_PCI_FIXUP_HEADER registration below.
 */
3159 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3162 /* Mobile 4 Series Chipset neglects to set RWBF capability */
3165 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3169 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);