2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
42 #define ROOT_SIZE VTD_PAGE_SIZE
43 #define CONTEXT_SIZE VTD_PAGE_SIZE
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48 #define IOAPIC_RANGE_START (0xfee00000)
49 #define IOAPIC_RANGE_END (0xfeefffff)
50 #define IOVA_START_ADDR (0x1000)
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
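/*
 * For illustration: with 4 KiB pages (PAGE_SHIFT == 12), IOVA_PFN() simply
 * drops the page offset, so IOVA_PFN(0xfee00000) == 0xfee00 and
 * DMA_32BIT_PFN == 0xfffff, the last page frame reachable through a
 * 32-bit DMA mask.
 */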
60 /* global iommu list, set NULL for ignored DMAR units */
61 static struct intel_iommu **g_iommus;
66 * 12-63: Context Ptr (12 - (haw-1))
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
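/*
 * Roughly: the root table is a single 4 KiB page of ROOT_ENTRY_NR entries
 * (256, assuming the 16-byte struct root_entry layout), one per PCI bus.
 * set_root_value()/set_root_present() below compose a present entry from
 * the physical address of that bus's context table, which is how
 * device_to_context_entry() installs newly allocated context tables.
 */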
74 static inline bool root_present(struct root_entry *root)
76 return (root->val & 1);
78 static inline void set_root_present(struct root_entry *root)
82 static inline void set_root_value(struct root_entry *root, unsigned long value)
84 root->val |= value & VTD_PAGE_MASK;
87 static inline struct context_entry *
88 get_context_addr_from_root(struct root_entry *root)
90 return (struct context_entry *)
91 (root_present(root)?phys_to_virt(
92 root->val & VTD_PAGE_MASK) :
99 * 1: fault processing disable
100 * 2-3: translation type
101 * 12-63: address space root
107 struct context_entry {
112 static inline bool context_present(struct context_entry *context)
114 return (context->lo & 1);
116 static inline void context_set_present(struct context_entry *context)
121 static inline void context_set_fault_enable(struct context_entry *context)
123 context->lo &= (((u64)-1) << 2) | 1;
126 #define CONTEXT_TT_MULTI_LEVEL 0
128 static inline void context_set_translation_type(struct context_entry *context,
131 context->lo &= (((u64)-1) << 4) | 3;
132 context->lo |= (value & 3) << 2;
135 static inline void context_set_address_root(struct context_entry *context,
138 context->lo |= value & VTD_PAGE_MASK;
141 static inline void context_set_address_width(struct context_entry *context,
144 context->hi |= value & 7;
147 static inline void context_set_domain_id(struct context_entry *context,
150 context->hi |= (value & ((1 << 16) - 1)) << 8;
153 static inline void context_clear_entry(struct context_entry *context)
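/*
 * Putting the helpers above together (as domain_context_mapping_one() does
 * later): a context entry is composed by placing the domain id in bits 8-23
 * of the high word, the address width (agaw) in bits 0-2, and the physical
 * address of the domain's top-level page table in bits 12-63 of the low
 * word, then setting the translation type, fault-enable and present bits.
 */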
165 * 12-63: Host physical address
171 static inline void dma_clear_pte(struct dma_pte *pte)
176 static inline void dma_set_pte_readable(struct dma_pte *pte)
178 pte->val |= DMA_PTE_READ;
181 static inline void dma_set_pte_writable(struct dma_pte *pte)
183 pte->val |= DMA_PTE_WRITE;
186 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
188 pte->val = (pte->val & ~3) | (prot & 3);
191 static inline u64 dma_pte_addr(struct dma_pte *pte)
193 return (pte->val & VTD_PAGE_MASK);
196 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
198 pte->val |= (addr & VTD_PAGE_MASK);
201 static inline bool dma_pte_present(struct dma_pte *pte)
203 return (pte->val & 3) != 0;
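/*
 * For illustration, a last-level PTE mapping host physical page hpa
 * read/write would be built with the helpers above roughly as
 *
 *	dma_set_pte_addr(pte, hpa);
 *	dma_set_pte_prot(pte, DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * (the helpers mask the address to VTD_PAGE_MASK and keep only the two low
 * permission bits), which is what domain_page_mapping() does per page.
 */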
206 /* devices under the same p2p bridge are owned in one domain */
207 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
210 int id; /* domain id */
211 struct intel_iommu *iommu; /* back pointer to owning iommu */
213 struct list_head devices; /* all devices' list */
214 struct iova_domain iovad; /* iova's that belong to this domain */
216 struct dma_pte *pgd; /* virtual address */
217 spinlock_t mapping_lock; /* page table lock */
218 int gaw; /* max guest address width */
220 /* adjusted guest address width, 0 is level 2 30-bit */
223 int flags; /* flags to find out type of domain */
226 /* PCI domain-device relationship */
227 struct device_domain_info {
228 struct list_head link; /* link to domain siblings */
229 struct list_head global; /* link to global list */
230 u8 bus; /* PCI bus number */
231 u8 devfn; /* PCI devfn number */
232 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
233 struct dmar_domain *domain; /* pointer to domain */
236 static void flush_unmaps_timeout(unsigned long data);
238 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
240 #define HIGH_WATER_MARK 250
241 struct deferred_flush_tables {
243 struct iova *iova[HIGH_WATER_MARK];
244 struct dmar_domain *domain[HIGH_WATER_MARK];
247 static struct deferred_flush_tables *deferred_flush;
249 /* bitmap for indexing intel_iommus */
250 static int g_num_of_iommus;
252 static DEFINE_SPINLOCK(async_umap_flush_lock);
253 static LIST_HEAD(unmaps_to_do);
256 static long list_size;
258 static void domain_remove_dev_info(struct dmar_domain *domain);
261 static int __initdata dmar_map_gfx = 1;
262 static int dmar_forcedac;
263 static int intel_iommu_strict;
265 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
266 static DEFINE_SPINLOCK(device_domain_lock);
267 static LIST_HEAD(device_domain_list);
269 static int __init intel_iommu_setup(char *str)
274 if (!strncmp(str, "off", 3)) {
276 printk(KERN_INFO"Intel-IOMMU: disabled\n");
277 } else if (!strncmp(str, "igfx_off", 8)) {
280 "Intel-IOMMU: disable GFX device mapping\n");
281 } else if (!strncmp(str, "forcedac", 8)) {
283 "Intel-IOMMU: Forcing DAC for PCI devices\n");
285 } else if (!strncmp(str, "strict", 6)) {
287 "Intel-IOMMU: disable batched IOTLB flush\n");
288 intel_iommu_strict = 1;
291 str += strcspn(str, ",");
297 __setup("intel_iommu=", intel_iommu_setup);
299 static struct kmem_cache *iommu_domain_cache;
300 static struct kmem_cache *iommu_devinfo_cache;
301 static struct kmem_cache *iommu_iova_cache;
303 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
308 /* trying to avoid low memory issues */
309 flags = current->flags & PF_MEMALLOC;
310 current->flags |= PF_MEMALLOC;
311 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
312 current->flags &= (~PF_MEMALLOC | flags);
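/*
 * The line above restores PF_MEMALLOC to its state on entry: the bit is
 * cleared only if it was not already set in the saved 'flags', and all
 * other task flags are left untouched.
 */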
317 static inline void *alloc_pgtable_page(void)
322 /* trying to avoid low memory issues */
323 flags = current->flags & PF_MEMALLOC;
324 current->flags |= PF_MEMALLOC;
325 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
326 current->flags &= (~PF_MEMALLOC | flags);
330 static inline void free_pgtable_page(void *vaddr)
332 free_page((unsigned long)vaddr);
335 static inline void *alloc_domain_mem(void)
337 return iommu_kmem_cache_alloc(iommu_domain_cache);
340 static void free_domain_mem(void *vaddr)
342 kmem_cache_free(iommu_domain_cache, vaddr);
345 static inline void * alloc_devinfo_mem(void)
347 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
350 static inline void free_devinfo_mem(void *vaddr)
352 kmem_cache_free(iommu_devinfo_cache, vaddr);
355 struct iova *alloc_iova_mem(void)
357 return iommu_kmem_cache_alloc(iommu_iova_cache);
360 void free_iova_mem(struct iova *iova)
362 kmem_cache_free(iommu_iova_cache, iova);
365 /* Gets context entry for a given bus and devfn */
366 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
369 struct root_entry *root;
370 struct context_entry *context;
371 unsigned long phy_addr;
374 spin_lock_irqsave(&iommu->lock, flags);
375 root = &iommu->root_entry[bus];
376 context = get_context_addr_from_root(root);
378 context = (struct context_entry *)alloc_pgtable_page();
380 spin_unlock_irqrestore(&iommu->lock, flags);
383 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
384 phy_addr = virt_to_phys((void *)context);
385 set_root_value(root, phy_addr);
386 set_root_present(root);
387 __iommu_flush_cache(iommu, root, sizeof(*root));
389 spin_unlock_irqrestore(&iommu->lock, flags);
390 return &context[devfn];
393 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
395 struct root_entry *root;
396 struct context_entry *context;
400 spin_lock_irqsave(&iommu->lock, flags);
401 root = &iommu->root_entry[bus];
402 context = get_context_addr_from_root(root);
407 ret = context_present(&context[devfn]);
409 spin_unlock_irqrestore(&iommu->lock, flags);
413 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
415 struct root_entry *root;
416 struct context_entry *context;
419 spin_lock_irqsave(&iommu->lock, flags);
420 root = &iommu->root_entry[bus];
421 context = get_context_addr_from_root(root);
423 context_clear_entry(&context[devfn]);
424 __iommu_flush_cache(iommu, &context[devfn], \
427 spin_unlock_irqrestore(&iommu->lock, flags);
430 static void free_context_table(struct intel_iommu *iommu)
432 struct root_entry *root;
435 struct context_entry *context;
437 spin_lock_irqsave(&iommu->lock, flags);
438 if (!iommu->root_entry) {
441 for (i = 0; i < ROOT_ENTRY_NR; i++) {
442 root = &iommu->root_entry[i];
443 context = get_context_addr_from_root(root);
445 free_pgtable_page(context);
447 free_pgtable_page(iommu->root_entry);
448 iommu->root_entry = NULL;
450 spin_unlock_irqrestore(&iommu->lock, flags);
453 /* page table handling */
454 #define LEVEL_STRIDE (9)
455 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
457 static inline int agaw_to_level(int agaw)
462 static inline int agaw_to_width(int agaw)
464 return 30 + agaw * LEVEL_STRIDE;
468 static inline int width_to_agaw(int width)
470 return (width - 30) / LEVEL_STRIDE;
473 static inline unsigned int level_to_offset_bits(int level)
475 return (12 + (level - 1) * LEVEL_STRIDE);
478 static inline int address_level_offset(u64 addr, int level)
480 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
483 static inline u64 level_mask(int level)
485 return ((u64)-1 << level_to_offset_bits(level));
488 static inline u64 level_size(int level)
490 return ((u64)1 << level_to_offset_bits(level));
493 static inline u64 align_to_level(u64 addr, int level)
495 return ((addr + level_size(level) - 1) & level_mask(level));
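/*
 * Worked example of the arithmetic above: for the default 48-bit address
 * width, width_to_agaw(48) == (48 - 30) / 9 == 2, and each page-table
 * level decodes LEVEL_STRIDE == 9 bits (512 eight-byte entries per 4 KiB
 * table).  level_to_offset_bits() gives 12, 21, 30 and 39 for levels 1-4,
 * so address_level_offset() extracts the 9-bit table index used at each
 * step of the walk in addr_to_dma_pte().
 */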
498 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
500 int addr_width = agaw_to_width(domain->agaw);
501 struct dma_pte *parent, *pte = NULL;
502 int level = agaw_to_level(domain->agaw);
506 BUG_ON(!domain->pgd);
508 addr &= (((u64)1) << addr_width) - 1;
509 parent = domain->pgd;
511 spin_lock_irqsave(&domain->mapping_lock, flags);
515 offset = address_level_offset(addr, level);
516 pte = &parent[offset];
520 if (!dma_pte_present(pte)) {
521 tmp_page = alloc_pgtable_page();
524 spin_unlock_irqrestore(&domain->mapping_lock,
528 __iommu_flush_cache(domain->iommu, tmp_page,
530 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
532 * higher-level tables always set r/w; the last-level page
533 * table controls read/write
535 dma_set_pte_readable(pte);
536 dma_set_pte_writable(pte);
537 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
539 parent = phys_to_virt(dma_pte_addr(pte));
543 spin_unlock_irqrestore(&domain->mapping_lock, flags);
547 /* return address's pte at specific level */
548 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
551 struct dma_pte *parent, *pte = NULL;
552 int total = agaw_to_level(domain->agaw);
555 parent = domain->pgd;
556 while (level <= total) {
557 offset = address_level_offset(addr, total);
558 pte = &parent[offset];
562 if (!dma_pte_present(pte))
564 parent = phys_to_virt(dma_pte_addr(pte));
570 /* clear one page's page table */
571 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
573 struct dma_pte *pte = NULL;
575 /* get last level pte */
576 pte = dma_addr_level_pte(domain, addr, 1);
580 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
584 /* clear last level pte, a tlb flush should be followed */
585 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
587 int addr_width = agaw_to_width(domain->agaw);
589 start &= (((u64)1) << addr_width) - 1;
590 end &= (((u64)1) << addr_width) - 1;
591 /* in case it's partial page */
592 start = PAGE_ALIGN(start);
595 /* we don't need lock here, nobody else touches the iova range */
596 while (start < end) {
597 dma_pte_clear_one(domain, start);
598 start += VTD_PAGE_SIZE;
602 /* free page table pages. last level pte should already be cleared */
603 static void dma_pte_free_pagetable(struct dmar_domain *domain,
606 int addr_width = agaw_to_width(domain->agaw);
608 int total = agaw_to_level(domain->agaw);
612 start &= (((u64)1) << addr_width) - 1;
613 end &= (((u64)1) << addr_width) - 1;
615 /* we don't need lock here, nobody else touches the iova range */
617 while (level <= total) {
618 tmp = align_to_level(start, level);
619 if (tmp >= end || (tmp + level_size(level) > end))
623 pte = dma_addr_level_pte(domain, tmp, level);
626 phys_to_virt(dma_pte_addr(pte)));
628 __iommu_flush_cache(domain->iommu,
631 tmp += level_size(level);
636 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
637 free_pgtable_page(domain->pgd);
643 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
645 struct root_entry *root;
648 root = (struct root_entry *)alloc_pgtable_page();
652 __iommu_flush_cache(iommu, root, ROOT_SIZE);
654 spin_lock_irqsave(&iommu->lock, flags);
655 iommu->root_entry = root;
656 spin_unlock_irqrestore(&iommu->lock, flags);
661 static void iommu_set_root_entry(struct intel_iommu *iommu)
667 addr = iommu->root_entry;
669 spin_lock_irqsave(&iommu->register_lock, flag);
670 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
672 cmd = iommu->gcmd | DMA_GCMD_SRTP;
673 writel(cmd, iommu->reg + DMAR_GCMD_REG);
675 /* Make sure hardware complete it */
676 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
677 readl, (sts & DMA_GSTS_RTPS), sts);
679 spin_unlock_irqrestore(&iommu->register_lock, flag);
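/*
 * Note the pattern used here and by the other global commands in this
 * file: write the command to DMAR_GCMD_REG, then poll DMAR_GSTS_REG via
 * IOMMU_WAIT_OP() until the matching status bit appears, which confirms
 * the hardware has latched the new setting.
 */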
682 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
687 if (!cap_rwbf(iommu->cap))
689 val = iommu->gcmd | DMA_GCMD_WBF;
691 spin_lock_irqsave(&iommu->register_lock, flag);
692 writel(val, iommu->reg + DMAR_GCMD_REG);
694 /* Make sure hardware complete it */
695 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
696 readl, (!(val & DMA_GSTS_WBFS)), val);
698 spin_unlock_irqrestore(&iommu->register_lock, flag);
701 /* return value determines whether we need a write buffer flush */
702 static int __iommu_flush_context(struct intel_iommu *iommu,
703 u16 did, u16 source_id, u8 function_mask, u64 type,
704 int non_present_entry_flush)
710 * In the non-present entry flush case, if the hardware doesn't cache
711 * non-present entries we do nothing; if it does cache them, we flush
712 * the entries of domain 0 (the domain id that is used to cache
713 * any non-present entries)
715 if (non_present_entry_flush) {
716 if (!cap_caching_mode(iommu->cap))
723 case DMA_CCMD_GLOBAL_INVL:
724 val = DMA_CCMD_GLOBAL_INVL;
726 case DMA_CCMD_DOMAIN_INVL:
727 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
729 case DMA_CCMD_DEVICE_INVL:
730 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
731 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
738 spin_lock_irqsave(&iommu->register_lock, flag);
739 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
741 /* Make sure hardware complete it */
742 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
743 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
745 spin_unlock_irqrestore(&iommu->register_lock, flag);
747 /* flush context entry will implicitly flush write buffer */
751 /* return value determines whether we need a write buffer flush */
752 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
753 u64 addr, unsigned int size_order, u64 type,
754 int non_present_entry_flush)
756 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
757 u64 val = 0, val_iva = 0;
761 * In the non-present entry flush case, if the hardware doesn't cache
762 * non-present entries we do nothing; if it does cache them, we flush
763 * the entries of domain 0 (the domain id that is used to cache
764 * any non-present entries)
766 if (non_present_entry_flush) {
767 if (!cap_caching_mode(iommu->cap))
774 case DMA_TLB_GLOBAL_FLUSH:
775 /* global flush doesn't need set IVA_REG */
776 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
778 case DMA_TLB_DSI_FLUSH:
779 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
781 case DMA_TLB_PSI_FLUSH:
782 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
783 /* Note: always flush non-leaf currently */
784 val_iva = size_order | addr;
789 /* Note: set drain read/write */
792 * This is probably just to be extra safe; it looks like we can
793 * ignore it without any impact.
795 if (cap_read_drain(iommu->cap))
796 val |= DMA_TLB_READ_DRAIN;
798 if (cap_write_drain(iommu->cap))
799 val |= DMA_TLB_WRITE_DRAIN;
801 spin_lock_irqsave(&iommu->register_lock, flag);
802 /* Note: Only uses first TLB reg currently */
804 dmar_writeq(iommu->reg + tlb_offset, val_iva);
805 dmar_writeq(iommu->reg + tlb_offset + 8, val);
807 /* Make sure hardware complete it */
808 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
809 dmar_readq, (!(val & DMA_TLB_IVT)), val);
811 spin_unlock_irqrestore(&iommu->register_lock, flag);
813 /* check IOTLB invalidation granularity */
814 if (DMA_TLB_IAIG(val) == 0)
815 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
816 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
817 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
818 (unsigned long long)DMA_TLB_IIRG(type),
819 (unsigned long long)DMA_TLB_IAIG(val));
820 /* flush iotlb entry will implicitly flush write buffer */
824 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
825 u64 addr, unsigned int pages, int non_present_entry_flush)
829 BUG_ON(addr & (~VTD_PAGE_MASK));
832 /* Fallback to domain selective flush if no PSI support */
833 if (!cap_pgsel_inv(iommu->cap))
834 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
836 non_present_entry_flush);
839 * PSI requires the number of pages to be a power of two, and the base
840 * address to be naturally aligned to that size
842 mask = ilog2(__roundup_pow_of_two(pages));
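/*
 * e.g. a request to flush 5 pages rounds up to 8, giving mask == 3: the
 * hardware then invalidates an 8-page naturally aligned region that
 * covers the request.
 */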
843 /* Fallback to domain selective flush if size is too big */
844 if (mask > cap_max_amask_val(iommu->cap))
845 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
846 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
848 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
850 non_present_entry_flush);
853 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
858 spin_lock_irqsave(&iommu->register_lock, flags);
859 pmen = readl(iommu->reg + DMAR_PMEN_REG);
860 pmen &= ~DMA_PMEN_EPM;
861 writel(pmen, iommu->reg + DMAR_PMEN_REG);
863 /* wait for the protected region status bit to clear */
864 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
865 readl, !(pmen & DMA_PMEN_PRS), pmen);
867 spin_unlock_irqrestore(&iommu->register_lock, flags);
870 static int iommu_enable_translation(struct intel_iommu *iommu)
875 spin_lock_irqsave(&iommu->register_lock, flags);
876 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
878 /* Make sure hardware complete it */
879 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
880 readl, (sts & DMA_GSTS_TES), sts);
882 iommu->gcmd |= DMA_GCMD_TE;
883 spin_unlock_irqrestore(&iommu->register_lock, flags);
887 static int iommu_disable_translation(struct intel_iommu *iommu)
892 spin_lock_irqsave(&iommu->register_lock, flag);
893 iommu->gcmd &= ~DMA_GCMD_TE;
894 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
896 /* Make sure hardware complete it */
897 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
898 readl, (!(sts & DMA_GSTS_TES)), sts);
900 spin_unlock_irqrestore(&iommu->register_lock, flag);
904 /* iommu interrupt handling. Most of it is MSI-like. */
906 static const char *fault_reason_strings[] =
909 "Present bit in root entry is clear",
910 "Present bit in context entry is clear",
911 "Invalid context entry",
912 "Access beyond MGAW",
913 "PTE Write access is not set",
914 "PTE Read access is not set",
915 "Next page table ptr is invalid",
916 "Root table address invalid",
917 "Context table ptr is invalid",
918 "non-zero reserved fields in RTP",
919 "non-zero reserved fields in CTP",
920 "non-zero reserved fields in PTE",
922 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
924 const char *dmar_get_fault_reason(u8 fault_reason)
926 if (fault_reason > MAX_FAULT_REASON_IDX)
929 return fault_reason_strings[fault_reason];
932 void dmar_msi_unmask(unsigned int irq)
934 struct intel_iommu *iommu = get_irq_data(irq);
938 spin_lock_irqsave(&iommu->register_lock, flag);
939 writel(0, iommu->reg + DMAR_FECTL_REG);
940 /* Read a reg to force flush the post write */
941 readl(iommu->reg + DMAR_FECTL_REG);
942 spin_unlock_irqrestore(&iommu->register_lock, flag);
945 void dmar_msi_mask(unsigned int irq)
948 struct intel_iommu *iommu = get_irq_data(irq);
951 spin_lock_irqsave(&iommu->register_lock, flag);
952 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
953 /* Read a reg to force flush the post write */
954 readl(iommu->reg + DMAR_FECTL_REG);
955 spin_unlock_irqrestore(&iommu->register_lock, flag);
958 void dmar_msi_write(int irq, struct msi_msg *msg)
960 struct intel_iommu *iommu = get_irq_data(irq);
963 spin_lock_irqsave(&iommu->register_lock, flag);
964 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
965 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
966 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
967 spin_unlock_irqrestore(&iommu->register_lock, flag);
970 void dmar_msi_read(int irq, struct msi_msg *msg)
972 struct intel_iommu *iommu = get_irq_data(irq);
975 spin_lock_irqsave(&iommu->register_lock, flag);
976 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
977 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
978 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
979 spin_unlock_irqrestore(&iommu->register_lock, flag);
982 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
983 u8 fault_reason, u16 source_id, unsigned long long addr)
987 reason = dmar_get_fault_reason(fault_reason);
990 "DMAR:[%s] Request device [%02x:%02x.%d] "
992 "DMAR:[fault reason %02d] %s\n",
993 (type ? "DMA Read" : "DMA Write"),
994 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
995 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
999 #define PRIMARY_FAULT_REG_LEN (16)
1000 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1002 struct intel_iommu *iommu = dev_id;
1003 int reg, fault_index;
1007 spin_lock_irqsave(&iommu->register_lock, flag);
1008 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1010 /* TBD: ignore advanced fault log currently */
1011 if (!(fault_status & DMA_FSTS_PPF))
1012 goto clear_overflow;
1014 fault_index = dma_fsts_fault_record_index(fault_status);
1015 reg = cap_fault_reg_offset(iommu->cap);
1023 /* highest 32 bits */
1024 data = readl(iommu->reg + reg +
1025 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1026 if (!(data & DMA_FRCD_F))
1029 fault_reason = dma_frcd_fault_reason(data);
1030 type = dma_frcd_type(data);
1032 data = readl(iommu->reg + reg +
1033 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1034 source_id = dma_frcd_source_id(data);
1036 guest_addr = dmar_readq(iommu->reg + reg +
1037 fault_index * PRIMARY_FAULT_REG_LEN);
1038 guest_addr = dma_frcd_page_addr(guest_addr);
1039 /* clear the fault */
1040 writel(DMA_FRCD_F, iommu->reg + reg +
1041 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1043 spin_unlock_irqrestore(&iommu->register_lock, flag);
1045 iommu_page_fault_do_one(iommu, type, fault_reason,
1046 source_id, guest_addr);
1049 if (fault_index > cap_num_fault_regs(iommu->cap))
1051 spin_lock_irqsave(&iommu->register_lock, flag);
1054 /* clear primary fault overflow */
1055 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1056 if (fault_status & DMA_FSTS_PFO)
1057 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1059 spin_unlock_irqrestore(&iommu->register_lock, flag);
1063 int dmar_set_interrupt(struct intel_iommu *iommu)
1069 printk(KERN_ERR "IOMMU: no free vectors\n");
1073 set_irq_data(irq, iommu);
1076 ret = arch_setup_dmar_msi(irq);
1078 set_irq_data(irq, NULL);
1084 /* Force any pending faults to be processed and cleared */
1085 iommu_page_fault(irq, iommu);
1087 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1089 printk(KERN_ERR "IOMMU: can't request irq\n");
1093 static int iommu_init_domains(struct intel_iommu *iommu)
1095 unsigned long ndomains;
1096 unsigned long nlongs;
1098 ndomains = cap_ndoms(iommu->cap);
1099 pr_debug("Number of Domains supported <%ld>\n", ndomains);
1100 nlongs = BITS_TO_LONGS(ndomains);
1102 /* TBD: there might be 64K domains,
1103 * consider other allocation for future chip
1105 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1106 if (!iommu->domain_ids) {
1107 printk(KERN_ERR "Allocating domain id array failed\n");
1110 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1112 if (!iommu->domains) {
1113 printk(KERN_ERR "Allocating domain array failed\n");
1114 kfree(iommu->domain_ids);
1118 spin_lock_init(&iommu->lock);
1121 * if Caching mode is set, then invalid translations are tagged
1122 * with domainid 0. Hence we need to pre-allocate it.
1124 if (cap_caching_mode(iommu->cap))
1125 set_bit(0, iommu->domain_ids);
1130 static void domain_exit(struct dmar_domain *domain);
1132 void free_dmar_iommu(struct intel_iommu *iommu)
1134 struct dmar_domain *domain;
1137 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1138 for (; i < cap_ndoms(iommu->cap); ) {
1139 domain = iommu->domains[i];
1140 clear_bit(i, iommu->domain_ids);
1141 domain_exit(domain);
1142 i = find_next_bit(iommu->domain_ids,
1143 cap_ndoms(iommu->cap), i+1);
1146 if (iommu->gcmd & DMA_GCMD_TE)
1147 iommu_disable_translation(iommu);
1150 set_irq_data(iommu->irq, NULL);
1151 /* This will mask the irq */
1152 free_irq(iommu->irq, iommu);
1153 destroy_irq(iommu->irq);
1156 kfree(iommu->domains);
1157 kfree(iommu->domain_ids);
1159 g_iommus[iommu->seq_id] = NULL;
1161 /* if all iommus are freed, free g_iommus */
1162 for (i = 0; i < g_num_of_iommus; i++) {
1167 if (i == g_num_of_iommus)
1170 /* free context mapping */
1171 free_context_table(iommu);
1174 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1177 unsigned long ndomains;
1178 struct dmar_domain *domain;
1179 unsigned long flags;
1181 domain = alloc_domain_mem();
1185 ndomains = cap_ndoms(iommu->cap);
1187 spin_lock_irqsave(&iommu->lock, flags);
1188 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1189 if (num >= ndomains) {
1190 spin_unlock_irqrestore(&iommu->lock, flags);
1191 free_domain_mem(domain);
1192 printk(KERN_ERR "IOMMU: no free domain ids\n");
1196 set_bit(num, iommu->domain_ids);
1198 domain->iommu = iommu;
1200 iommu->domains[num] = domain;
1201 spin_unlock_irqrestore(&iommu->lock, flags);
1206 static void iommu_free_domain(struct dmar_domain *domain)
1208 unsigned long flags;
1210 spin_lock_irqsave(&domain->iommu->lock, flags);
1211 clear_bit(domain->id, domain->iommu->domain_ids);
1212 spin_unlock_irqrestore(&domain->iommu->lock, flags);
1215 static struct iova_domain reserved_iova_list;
1216 static struct lock_class_key reserved_alloc_key;
1217 static struct lock_class_key reserved_rbtree_key;
1219 static void dmar_init_reserved_ranges(void)
1221 struct pci_dev *pdev = NULL;
1226 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1228 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1229 &reserved_alloc_key);
1230 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1231 &reserved_rbtree_key);
1233 /* IOAPIC ranges shouldn't be accessed by DMA */
1234 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1235 IOVA_PFN(IOAPIC_RANGE_END));
1237 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1239 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1240 for_each_pci_dev(pdev) {
1243 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1244 r = &pdev->resource[i];
1245 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1249 size = r->end - addr;
1250 size = PAGE_ALIGN(size);
1251 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1252 IOVA_PFN(size + addr) - 1);
1254 printk(KERN_ERR "Reserve iova failed\n");
1260 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1262 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1265 static inline int guestwidth_to_adjustwidth(int gaw)
1268 int r = (gaw - 12) % 9;
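/*
 * e.g. gaw == 48 gives r == (48 - 12) % 9 == 0, i.e. 48 bits is already
 * 12 offset bits plus a whole number of 9-bit levels and needs no
 * adjustment; other widths are presumably rounded up to the next such
 * boundary.
 */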
1279 static int domain_init(struct dmar_domain *domain, int guest_width)
1281 struct intel_iommu *iommu;
1282 int adjust_width, agaw;
1283 unsigned long sagaw;
1285 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1286 spin_lock_init(&domain->mapping_lock);
1288 domain_reserve_special_ranges(domain);
1290 /* calculate AGAW */
1291 iommu = domain->iommu;
1292 if (guest_width > cap_mgaw(iommu->cap))
1293 guest_width = cap_mgaw(iommu->cap);
1294 domain->gaw = guest_width;
1295 adjust_width = guestwidth_to_adjustwidth(guest_width);
1296 agaw = width_to_agaw(adjust_width);
1297 sagaw = cap_sagaw(iommu->cap);
1298 if (!test_bit(agaw, &sagaw)) {
1299 /* hardware doesn't support it, choose a bigger one */
1300 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1301 agaw = find_next_bit(&sagaw, 5, agaw);
1305 domain->agaw = agaw;
1306 INIT_LIST_HEAD(&domain->devices);
1308 /* always allocate the top pgd */
1309 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1312 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1316 static void domain_exit(struct dmar_domain *domain)
1320 /* Domain 0 is reserved, so don't process it */
1324 domain_remove_dev_info(domain);
1326 put_iova_domain(&domain->iovad);
1327 end = DOMAIN_MAX_ADDR(domain->gaw);
1328 end = end & (~PAGE_MASK);
1331 dma_pte_clear_range(domain, 0, end);
1333 /* free page tables */
1334 dma_pte_free_pagetable(domain, 0, end);
1336 iommu_free_domain(domain);
1337 free_domain_mem(domain);
1340 static int domain_context_mapping_one(struct dmar_domain *domain,
1343 struct context_entry *context;
1344 struct intel_iommu *iommu = domain->iommu;
1345 unsigned long flags;
1347 pr_debug("Set context mapping for %02x:%02x.%d\n",
1348 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1349 BUG_ON(!domain->pgd);
1350 context = device_to_context_entry(iommu, bus, devfn);
1353 spin_lock_irqsave(&iommu->lock, flags);
1354 if (context_present(context)) {
1355 spin_unlock_irqrestore(&iommu->lock, flags);
1359 context_set_domain_id(context, domain->id);
1360 context_set_address_width(context, domain->agaw);
1361 context_set_address_root(context, virt_to_phys(domain->pgd));
1362 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1363 context_set_fault_enable(context);
1364 context_set_present(context);
1365 __iommu_flush_cache(iommu, context, sizeof(*context));
1367 /* it's a non-present to present mapping */
1368 if (iommu->flush.flush_context(iommu, domain->id,
1369 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1370 DMA_CCMD_DEVICE_INVL, 1))
1371 iommu_flush_write_buffer(iommu);
1373 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1375 spin_unlock_irqrestore(&iommu->lock, flags);
1380 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1383 struct pci_dev *tmp, *parent;
1385 ret = domain_context_mapping_one(domain, pdev->bus->number,
1390 /* dependent device mapping */
1391 tmp = pci_find_upstream_pcie_bridge(pdev);
1394 /* Secondary interface's bus number and devfn 0 */
1395 parent = pdev->bus->self;
1396 while (parent != tmp) {
1397 ret = domain_context_mapping_one(domain, parent->bus->number,
1401 parent = parent->bus->self;
1403 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1404 return domain_context_mapping_one(domain,
1405 tmp->subordinate->number, 0);
1406 else /* this is a legacy PCI bridge */
1407 return domain_context_mapping_one(domain,
1408 tmp->bus->number, tmp->devfn);
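/*
 * The walk above programs a context entry not just for the device but
 * also for every bridge on the path to it: a PCIe-to-PCI bridge is mapped
 * with its (secondary bus, devfn 0) pair, a legacy PCI bridge with its
 * own bus/devfn, presumably matching the source-id such bridges put on
 * the DMA requests they forward.
 */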
1411 static int domain_context_mapped(struct dmar_domain *domain,
1412 struct pci_dev *pdev)
1415 struct pci_dev *tmp, *parent;
1417 ret = device_context_mapped(domain->iommu,
1418 pdev->bus->number, pdev->devfn);
1421 /* dependent device mapping */
1422 tmp = pci_find_upstream_pcie_bridge(pdev);
1425 /* Secondary interface's bus number and devfn 0 */
1426 parent = pdev->bus->self;
1427 while (parent != tmp) {
1428 ret = device_context_mapped(domain->iommu, parent->bus->number,
1432 parent = parent->bus->self;
1435 return device_context_mapped(domain->iommu,
1436 tmp->subordinate->number, 0);
1438 return device_context_mapped(domain->iommu,
1439 tmp->bus->number, tmp->devfn);
1443 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1444 u64 hpa, size_t size, int prot)
1446 u64 start_pfn, end_pfn;
1447 struct dma_pte *pte;
1449 int addr_width = agaw_to_width(domain->agaw);
1451 hpa &= (((u64)1) << addr_width) - 1;
1453 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1456 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1457 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1459 while (start_pfn < end_pfn) {
1460 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1463 /* We don't need lock here, nobody else
1464 * touches the iova range
1466 BUG_ON(dma_pte_addr(pte));
1467 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1468 dma_set_pte_prot(pte, prot);
1469 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1476 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1478 clear_context_table(domain->iommu, bus, devfn);
1479 domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1480 DMA_CCMD_GLOBAL_INVL, 0);
1481 domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1482 DMA_TLB_GLOBAL_FLUSH, 0);
1485 static void domain_remove_dev_info(struct dmar_domain *domain)
1487 struct device_domain_info *info;
1488 unsigned long flags;
1490 spin_lock_irqsave(&device_domain_lock, flags);
1491 while (!list_empty(&domain->devices)) {
1492 info = list_entry(domain->devices.next,
1493 struct device_domain_info, link);
1494 list_del(&info->link);
1495 list_del(&info->global);
1497 info->dev->dev.archdata.iommu = NULL;
1498 spin_unlock_irqrestore(&device_domain_lock, flags);
1500 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1501 free_devinfo_mem(info);
1503 spin_lock_irqsave(&device_domain_lock, flags);
1505 spin_unlock_irqrestore(&device_domain_lock, flags);
1510 * Note: struct pci_dev->dev.archdata.iommu is used to store the domain info
1512 static struct dmar_domain *
1513 find_domain(struct pci_dev *pdev)
1515 struct device_domain_info *info;
1517 /* No lock here, assumes no domain exit in normal case */
1518 info = pdev->dev.archdata.iommu;
1520 return info->domain;
1524 /* domain is initialized */
1525 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1527 struct dmar_domain *domain, *found = NULL;
1528 struct intel_iommu *iommu;
1529 struct dmar_drhd_unit *drhd;
1530 struct device_domain_info *info, *tmp;
1531 struct pci_dev *dev_tmp;
1532 unsigned long flags;
1533 int bus = 0, devfn = 0;
1535 domain = find_domain(pdev);
1539 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1541 if (dev_tmp->is_pcie) {
1542 bus = dev_tmp->subordinate->number;
1545 bus = dev_tmp->bus->number;
1546 devfn = dev_tmp->devfn;
1548 spin_lock_irqsave(&device_domain_lock, flags);
1549 list_for_each_entry(info, &device_domain_list, global) {
1550 if (info->bus == bus && info->devfn == devfn) {
1551 found = info->domain;
1555 spin_unlock_irqrestore(&device_domain_lock, flags);
1556 /* pcie-pci bridge already has a domain, use it */
1563 /* Allocate new domain for the device */
1564 drhd = dmar_find_matched_drhd_unit(pdev);
1566 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1570 iommu = drhd->iommu;
1572 domain = iommu_alloc_domain(iommu);
1576 if (domain_init(domain, gaw)) {
1577 domain_exit(domain);
1581 /* register pcie-to-pci device */
1583 info = alloc_devinfo_mem();
1585 domain_exit(domain);
1589 info->devfn = devfn;
1591 info->domain = domain;
1592 /* This domain is shared by devices under p2p bridge */
1593 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1595 /* pcie-to-pci bridge already has a domain, use it */
1597 spin_lock_irqsave(&device_domain_lock, flags);
1598 list_for_each_entry(tmp, &device_domain_list, global) {
1599 if (tmp->bus == bus && tmp->devfn == devfn) {
1600 found = tmp->domain;
1605 free_devinfo_mem(info);
1606 domain_exit(domain);
1609 list_add(&info->link, &domain->devices);
1610 list_add(&info->global, &device_domain_list);
1612 spin_unlock_irqrestore(&device_domain_lock, flags);
1616 info = alloc_devinfo_mem();
1619 info->bus = pdev->bus->number;
1620 info->devfn = pdev->devfn;
1622 info->domain = domain;
1623 spin_lock_irqsave(&device_domain_lock, flags);
1624 /* somebody else may have beaten us to it */
1625 found = find_domain(pdev);
1626 if (found != NULL) {
1627 spin_unlock_irqrestore(&device_domain_lock, flags);
1628 if (found != domain) {
1629 domain_exit(domain);
1632 free_devinfo_mem(info);
1635 list_add(&info->link, &domain->devices);
1636 list_add(&info->global, &device_domain_list);
1637 pdev->dev.archdata.iommu = info;
1638 spin_unlock_irqrestore(&device_domain_lock, flags);
1641 /* recheck it here, maybe others set it */
1642 return find_domain(pdev);
1645 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1646 unsigned long long start,
1647 unsigned long long end)
1649 struct dmar_domain *domain;
1651 unsigned long long base;
1655 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1656 pci_name(pdev), start, end);
1657 /* page table init */
1658 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1662 /* The address might not be aligned */
1663 base = start & PAGE_MASK;
1665 size = PAGE_ALIGN(size);
1666 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1667 IOVA_PFN(base + size) - 1)) {
1668 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1673 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1674 size, base, pci_name(pdev));
1676 * RMRR range might have overlap with physical memory range,
1679 dma_pte_clear_range(domain, base, base + size);
1681 ret = domain_page_mapping(domain, base, base, size,
1682 DMA_PTE_READ|DMA_PTE_WRITE);
1686 /* context entry init */
1687 ret = domain_context_mapping(domain, pdev);
1691 domain_exit(domain);
1696 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1697 struct pci_dev *pdev)
1699 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1701 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1702 rmrr->end_address + 1);
1705 #ifdef CONFIG_DMAR_GFX_WA
1706 struct iommu_prepare_data {
1707 struct pci_dev *pdev;
1711 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1712 unsigned long end_pfn, void *datax)
1714 struct iommu_prepare_data *data;
1716 data = (struct iommu_prepare_data *)datax;
1718 data->ret = iommu_prepare_identity_map(data->pdev,
1719 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1724 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1727 struct iommu_prepare_data data;
1732 for_each_online_node(nid) {
1733 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1740 static void __init iommu_prepare_gfx_mapping(void)
1742 struct pci_dev *pdev = NULL;
1745 for_each_pci_dev(pdev) {
1746 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1747 !IS_GFX_DEVICE(pdev))
1749 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1751 ret = iommu_prepare_with_active_regions(pdev);
1753 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1756 #else /* !CONFIG_DMAR_GFX_WA */
1757 static inline void iommu_prepare_gfx_mapping(void)
1763 #ifdef CONFIG_DMAR_FLOPPY_WA
1764 static inline void iommu_prepare_isa(void)
1766 struct pci_dev *pdev;
1769 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1773 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1774 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1777 printk("IOMMU: Failed to create 0-64M identity map, "
1778 "floppy might not work\n");
1782 static inline void iommu_prepare_isa(void)
1786 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1788 static int __init init_dmars(void)
1790 struct dmar_drhd_unit *drhd;
1791 struct dmar_rmrr_unit *rmrr;
1792 struct pci_dev *pdev;
1793 struct intel_iommu *iommu;
1794 int i, ret, unit = 0;
1799 * initialize and program root entry to not present
1802 for_each_drhd_unit(drhd) {
1805 * lock not needed as this is only incremented in the single-
1806 * threaded kernel __init code path; all other accesses are reads
1811 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1814 printk(KERN_ERR "Allocating global iommu array failed\n");
1819 deferred_flush = kzalloc(g_num_of_iommus *
1820 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1821 if (!deferred_flush) {
1827 for_each_drhd_unit(drhd) {
1831 iommu = drhd->iommu;
1832 g_iommus[iommu->seq_id] = iommu;
1834 ret = iommu_init_domains(iommu);
1840 * we could share the same root & context tables
1841 * among all IOMMUs. Need to split it later.
1843 ret = iommu_alloc_root_entry(iommu);
1845 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1850 for_each_drhd_unit(drhd) {
1854 iommu = drhd->iommu;
1855 if (dmar_enable_qi(iommu)) {
1857 * Queued Invalidate not enabled, use Register Based
1860 iommu->flush.flush_context = __iommu_flush_context;
1861 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1862 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1864 (unsigned long long)drhd->reg_base_addr);
1866 iommu->flush.flush_context = qi_flush_context;
1867 iommu->flush.flush_iotlb = qi_flush_iotlb;
1868 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1870 (unsigned long long)drhd->reg_base_addr);
1876 * for each dev attached to rmrr
1878 * locate drhd for dev, alloc domain for dev
1879 * allocate free domain
1880 * allocate page table entries for rmrr
1881 * if context not allocated for bus
1882 * allocate and init context
1883 * set present in root table for this bus
1884 * init context with domain, translation etc
1888 for_each_rmrr_units(rmrr) {
1889 for (i = 0; i < rmrr->devices_cnt; i++) {
1890 pdev = rmrr->devices[i];
1891 /* some BIOSes list non-existent devices in the DMAR table */
1894 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1897 "IOMMU: mapping reserved region failed\n");
1901 iommu_prepare_gfx_mapping();
1903 iommu_prepare_isa();
1908 * global invalidate context cache
1909 * global invalidate iotlb
1910 * enable translation
1912 for_each_drhd_unit(drhd) {
1915 iommu = drhd->iommu;
1916 sprintf (iommu->name, "dmar%d", unit++);
1918 iommu_flush_write_buffer(iommu);
1920 ret = dmar_set_interrupt(iommu);
1924 iommu_set_root_entry(iommu);
1926 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1928 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1930 iommu_disable_protect_mem_regions(iommu);
1932 ret = iommu_enable_translation(iommu);
1939 for_each_drhd_unit(drhd) {
1942 iommu = drhd->iommu;
1949 static inline u64 aligned_size(u64 host_addr, size_t size)
1952 addr = (host_addr & (~PAGE_MASK)) + size;
1953 return PAGE_ALIGN(addr);
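/*
 * Example: a 2-byte buffer at host_addr 0xfff straddles a page boundary,
 * so aligned_size(0xfff, 2) == PAGE_ALIGN(0xfff + 2) == 0x2000, i.e. two
 * full pages of IOVA space are needed for it.
 */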
1957 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1961 /* Make sure it's in range */
1962 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1963 if (!size || (IOVA_START_ADDR + size > end))
1966 piova = alloc_iova(&domain->iovad,
1967 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1971 static struct iova *
1972 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1973 size_t size, u64 dma_mask)
1975 struct pci_dev *pdev = to_pci_dev(dev);
1976 struct iova *iova = NULL;
1978 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1979 iova = iommu_alloc_iova(domain, size, dma_mask);
1982 * First try to allocate an io virtual address in
1983 * DMA_32BIT_MASK and if that fails then try allocating
1986 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1988 iova = iommu_alloc_iova(domain, size, dma_mask);
1992 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1999 static struct dmar_domain *
2000 get_valid_domain_for_dev(struct pci_dev *pdev)
2002 struct dmar_domain *domain;
2005 domain = get_domain_for_dev(pdev,
2006 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2009 "Allocating domain for %s failed", pci_name(pdev));
2013 /* make sure context mapping is ok */
2014 if (unlikely(!domain_context_mapped(domain, pdev))) {
2015 ret = domain_context_mapping(domain, pdev);
2018 "Domain context map for %s failed",
2027 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2028 size_t size, int dir, u64 dma_mask)
2030 struct pci_dev *pdev = to_pci_dev(hwdev);
2031 struct dmar_domain *domain;
2032 phys_addr_t start_paddr;
2037 BUG_ON(dir == DMA_NONE);
2038 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2041 domain = get_valid_domain_for_dev(pdev);
2045 size = aligned_size((u64)paddr, size);
2047 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2051 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2054 * Check if DMAR supports zero-length reads on write only
2057 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2058 !cap_zlr(domain->iommu->cap))
2059 prot |= DMA_PTE_READ;
2060 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2061 prot |= DMA_PTE_WRITE;
2063 * paddr - (paddr + size) might cover only part of a page; we should map
2064 * whole pages. Note: if two parts of one page are mapped separately, we
2065 * might have two guest_addrs mapping to the same host paddr, but this
2066 * is not a big problem
2068 ret = domain_page_mapping(domain, start_paddr,
2069 ((u64)paddr) & PAGE_MASK, size, prot);
2073 /* it's a non-present to present mapping */
2074 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
2075 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2077 iommu_flush_write_buffer(domain->iommu);
2079 return start_paddr + ((u64)paddr & (~PAGE_MASK));
2083 __free_iova(&domain->iovad, iova);
2084 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
2085 pci_name(pdev), size, (unsigned long long)paddr, dir);
2089 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2090 size_t size, int dir)
2092 return __intel_map_single(hwdev, paddr, size, dir,
2093 to_pci_dev(hwdev)->dma_mask);
2096 static void flush_unmaps(void)
2102 /* just flush them all */
2103 for (i = 0; i < g_num_of_iommus; i++) {
2104 struct intel_iommu *iommu = g_iommus[i];
2108 if (deferred_flush[i].next) {
2109 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2110 DMA_TLB_GLOBAL_FLUSH, 0);
2111 for (j = 0; j < deferred_flush[i].next; j++) {
2112 __free_iova(&deferred_flush[i].domain[j]->iovad,
2113 deferred_flush[i].iova[j]);
2115 deferred_flush[i].next = 0;
2122 static void flush_unmaps_timeout(unsigned long data)
2124 unsigned long flags;
2126 spin_lock_irqsave(&async_umap_flush_lock, flags);
2128 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2131 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2133 unsigned long flags;
2136 spin_lock_irqsave(&async_umap_flush_lock, flags);
2137 if (list_size == HIGH_WATER_MARK)
2140 iommu_id = dom->iommu->seq_id;
2142 next = deferred_flush[iommu_id].next;
2143 deferred_flush[iommu_id].domain[next] = dom;
2144 deferred_flush[iommu_id].iova[next] = iova;
2145 deferred_flush[iommu_id].next++;
2148 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2152 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
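/*
 * Sketch of the deferred-unmap scheme above: unmaps are queued per-IOMMU
 * in deferred_flush[], and either reaching HIGH_WATER_MARK or the 10 ms
 * unmap_timer triggers flush_unmaps(), which issues one global IOTLB
 * flush per IOMMU and only then returns the queued iovas to the
 * allocator, amortising the cost of the invalidations.
 */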
2155 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2158 struct pci_dev *pdev = to_pci_dev(dev);
2159 struct dmar_domain *domain;
2160 unsigned long start_addr;
2163 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2165 domain = find_domain(pdev);
2168 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2172 start_addr = iova->pfn_lo << PAGE_SHIFT;
2173 size = aligned_size((u64)dev_addr, size);
2175 pr_debug("Device %s unmapping: %lx@%llx\n",
2176 pci_name(pdev), size, (unsigned long long)start_addr);
2178 /* clear the whole page */
2179 dma_pte_clear_range(domain, start_addr, start_addr + size);
2180 /* free page tables */
2181 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2182 if (intel_iommu_strict) {
2183 if (iommu_flush_iotlb_psi(domain->iommu,
2184 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2185 iommu_flush_write_buffer(domain->iommu);
2187 __free_iova(&domain->iovad, iova);
2189 add_unmap(domain, iova);
2191 * queue up the release of the unmap to save the 1/6th of the
2192 * cpu used up by the iotlb flush operation...
2197 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2198 dma_addr_t *dma_handle, gfp_t flags)
2203 size = PAGE_ALIGN(size);
2204 order = get_order(size);
2205 flags &= ~(GFP_DMA | GFP_DMA32);
2207 vaddr = (void *)__get_free_pages(flags, order);
2210 memset(vaddr, 0, size);
2212 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2214 hwdev->coherent_dma_mask);
2217 free_pages((unsigned long)vaddr, order);
2221 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2222 dma_addr_t dma_handle)
2226 size = PAGE_ALIGN(size);
2227 order = get_order(size);
2229 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2230 free_pages((unsigned long)vaddr, order);
2233 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2235 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2236 int nelems, int dir)
2239 struct pci_dev *pdev = to_pci_dev(hwdev);
2240 struct dmar_domain *domain;
2241 unsigned long start_addr;
2245 struct scatterlist *sg;
2247 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2250 domain = find_domain(pdev);
2252 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2255 for_each_sg(sglist, sg, nelems, i) {
2256 addr = SG_ENT_VIRT_ADDRESS(sg);
2257 size += aligned_size((u64)addr, sg->length);
2260 start_addr = iova->pfn_lo << PAGE_SHIFT;
2262 /* clear the whole page */
2263 dma_pte_clear_range(domain, start_addr, start_addr + size);
2264 /* free page tables */
2265 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2267 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2268 size >> VTD_PAGE_SHIFT, 0))
2269 iommu_flush_write_buffer(domain->iommu);
2272 __free_iova(&domain->iovad, iova);
2275 static int intel_nontranslate_map_sg(struct device *hddev,
2276 struct scatterlist *sglist, int nelems, int dir)
2279 struct scatterlist *sg;
2281 for_each_sg(sglist, sg, nelems, i) {
2282 BUG_ON(!sg_page(sg));
2283 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2284 sg->dma_length = sg->length;
2289 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2294 struct pci_dev *pdev = to_pci_dev(hwdev);
2295 struct dmar_domain *domain;
2299 struct iova *iova = NULL;
2301 struct scatterlist *sg;
2302 unsigned long start_addr;
2304 BUG_ON(dir == DMA_NONE);
2305 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2306 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2308 domain = get_valid_domain_for_dev(pdev);
2312 for_each_sg(sglist, sg, nelems, i) {
2313 addr = SG_ENT_VIRT_ADDRESS(sg);
2314 addr = (void *)virt_to_phys(addr);
2315 size += aligned_size((u64)addr, sg->length);
2318 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2320 sglist->dma_length = 0;
2325 * Check if DMAR supports zero-length reads on write only
2328 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2329 !cap_zlr(domain->iommu->cap))
2330 prot |= DMA_PTE_READ;
2331 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2332 prot |= DMA_PTE_WRITE;
2334 start_addr = iova->pfn_lo << PAGE_SHIFT;
2336 for_each_sg(sglist, sg, nelems, i) {
2337 addr = SG_ENT_VIRT_ADDRESS(sg);
2338 addr = (void *)virt_to_phys(addr);
2339 size = aligned_size((u64)addr, sg->length);
2340 ret = domain_page_mapping(domain, start_addr + offset,
2341 ((u64)addr) & PAGE_MASK,
2344 /* clear the page */
2345 dma_pte_clear_range(domain, start_addr,
2346 start_addr + offset);
2347 /* free page tables */
2348 dma_pte_free_pagetable(domain, start_addr,
2349 start_addr + offset);
2351 __free_iova(&domain->iovad, iova);
2354 sg->dma_address = start_addr + offset +
2355 ((u64)addr & (~PAGE_MASK));
2356 sg->dma_length = sg->length;
2360 /* it's a non-present to present mapping */
2361 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2362 start_addr, offset >> VTD_PAGE_SHIFT, 1))
2363 iommu_flush_write_buffer(domain->iommu);
2367 static struct dma_mapping_ops intel_dma_ops = {
2368 .alloc_coherent = intel_alloc_coherent,
2369 .free_coherent = intel_free_coherent,
2370 .map_single = intel_map_single,
2371 .unmap_single = intel_unmap_single,
2372 .map_sg = intel_map_sg,
2373 .unmap_sg = intel_unmap_sg,
2376 static inline int iommu_domain_cache_init(void)
2380 iommu_domain_cache = kmem_cache_create("iommu_domain",
2381 sizeof(struct dmar_domain),
2386 if (!iommu_domain_cache) {
2387 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2394 static inline int iommu_devinfo_cache_init(void)
2398 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2399 sizeof(struct device_domain_info),
2403 if (!iommu_devinfo_cache) {
2404 printk(KERN_ERR "Couldn't create devinfo cache\n");
2411 static inline int iommu_iova_cache_init(void)
2415 iommu_iova_cache = kmem_cache_create("iommu_iova",
2416 sizeof(struct iova),
2420 if (!iommu_iova_cache) {
2421 printk(KERN_ERR "Couldn't create iova cache\n");
2428 static int __init iommu_init_mempool(void)
2431 ret = iommu_iova_cache_init();
2435 ret = iommu_domain_cache_init();
2439 ret = iommu_devinfo_cache_init();
2443 kmem_cache_destroy(iommu_domain_cache);
2445 kmem_cache_destroy(iommu_iova_cache);
2450 static void __init iommu_exit_mempool(void)
2452 kmem_cache_destroy(iommu_devinfo_cache);
2453 kmem_cache_destroy(iommu_domain_cache);
2454 kmem_cache_destroy(iommu_iova_cache);
2458 static void __init init_no_remapping_devices(void)
2460 struct dmar_drhd_unit *drhd;
2462 for_each_drhd_unit(drhd) {
2463 if (!drhd->include_all) {
2465 for (i = 0; i < drhd->devices_cnt; i++)
2466 if (drhd->devices[i] != NULL)
2468 /* ignore DMAR unit if no pci devices exist */
2469 if (i == drhd->devices_cnt)
2477 for_each_drhd_unit(drhd) {
2479 if (drhd->ignored || drhd->include_all)
2482 for (i = 0; i < drhd->devices_cnt; i++)
2483 if (drhd->devices[i] &&
2484 !IS_GFX_DEVICE(drhd->devices[i]))
2487 if (i < drhd->devices_cnt)
2490 /* bypass IOMMU if it is just for gfx devices */
2492 for (i = 0; i < drhd->devices_cnt; i++) {
2493 if (!drhd->devices[i])
2495 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2500 int __init intel_iommu_init(void)
2504 if (dmar_table_init())
2507 if (dmar_dev_scope_init())
2511 * Check the need for DMA-remapping initialization now.
2512 * Above initialization will also be used by Interrupt-remapping.
2514 if (no_iommu || swiotlb || dmar_disabled)
2517 iommu_init_mempool();
2518 dmar_init_reserved_ranges();
2520 init_no_remapping_devices();
2524 printk(KERN_ERR "IOMMU: dmar init failed\n");
2525 put_iova_domain(&reserved_iova_list);
2526 iommu_exit_mempool();
2530 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2532 init_timer(&unmap_timer);
2534 dma_ops = &intel_dma_ops;
2538 void intel_iommu_domain_exit(struct dmar_domain *domain)
2542 /* Domain 0 is reserved, so don't process it */
2546 end = DOMAIN_MAX_ADDR(domain->gaw);
2547 end = end & (~VTD_PAGE_MASK);
2550 dma_pte_clear_range(domain, 0, end);
2552 /* free page tables */
2553 dma_pte_free_pagetable(domain, 0, end);
2555 iommu_free_domain(domain);
2556 free_domain_mem(domain);
2558 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2560 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2562 struct dmar_drhd_unit *drhd;
2563 struct dmar_domain *domain;
2564 struct intel_iommu *iommu;
2566 drhd = dmar_find_matched_drhd_unit(pdev);
2568 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2572 iommu = drhd->iommu;
2575 "intel_iommu_domain_alloc: iommu == NULL\n");
2578 domain = iommu_alloc_domain(iommu);
2581 "intel_iommu_domain_alloc: domain == NULL\n");
2584 if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2586 "intel_iommu_domain_alloc: domain_init() failed\n");
2587 intel_iommu_domain_exit(domain);
2592 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2594 int intel_iommu_context_mapping(
2595 struct dmar_domain *domain, struct pci_dev *pdev)
2598 rc = domain_context_mapping(domain, pdev);
2601 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2603 int intel_iommu_page_mapping(
2604 struct dmar_domain *domain, dma_addr_t iova,
2605 u64 hpa, size_t size, int prot)
2608 rc = domain_page_mapping(domain, iova, hpa, size, prot);
2611 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2613 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2615 detach_domain_for_dev(domain, bus, devfn);
2617 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2619 struct dmar_domain *
2620 intel_iommu_find_domain(struct pci_dev *pdev)
2622 return find_domain(pdev);
2624 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2626 int intel_iommu_found(void)
2628 return g_num_of_iommus;
2630 EXPORT_SYMBOL_GPL(intel_iommu_found);
2632 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2634 struct dma_pte *pte;
2638 pte = addr_to_dma_pte(domain, iova);
2641 pfn = dma_pte_addr(pte);
2643 return pfn >> VTD_PAGE_SHIFT;
2645 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);