select NUMA
        select ACPI_NUMA
        select SWIOTLB
+       select PCI_MSI
        help
          This selects the system type of your hardware.  A "generic" kernel
          will run on any supported IA-64 system.  However, if you configure
 
          generic               For any supported IA-64 system
          DIG-compliant         For DIG ("Developer's Interface Guide") compliant systems
+         DIG+Intel+IOMMU       For DIG systems with Intel IOMMU
          HP-zx1/sx1000         For HP systems
          HP-zx1/sx1000+swiotlb For HP systems with (broken) DMA-constrained devices.
          SGI-SN2               For SGI Altix systems
        bool "DIG-compliant"
        select SWIOTLB
 
+config IA64_DIG_VTD
+       bool "DIG+Intel+IOMMU"
+       select DMAR
+       select PCI_MSI
+
 config IA64_HP_ZX1
        bool "HP-zx1/sx1000"
        help
 
 source "drivers/pcmcia/Kconfig"
 
+config DMAR
+       bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
+       depends on IA64_GENERIC && ACPI && EXPERIMENTAL
+       help
+         DMA remapping (DMAR) device support enables independent address
+         translations for Direct Memory Access (DMA) from devices.
+         The DMA remapping units are reported via ACPI tables, together
+         with the PCI device scope covered by each unit.
+
 endmenu
 
 endif
 
 core-y                         += arch/ia64/kernel/ arch/ia64/mm/
 core-$(CONFIG_IA32_SUPPORT)    += arch/ia64/ia32/
 core-$(CONFIG_IA64_DIG)        += arch/ia64/dig/
+core-$(CONFIG_IA64_DIG_VTD)    += arch/ia64/dig/
 core-$(CONFIG_IA64_GENERIC)    += arch/ia64/dig/
 core-$(CONFIG_IA64_HP_ZX1)     += arch/ia64/dig/
 core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/
 
 CONFIG_BINFMT_ELF=y
 CONFIG_BINFMT_MISC=m
 
+# CONFIG_DMAR is not set
+
 #
 # Power management and ACPI
 #
 
 CONFIG_BINFMT_ELF=y
 CONFIG_BINFMT_MISC=m
 
+# CONFIG_DMAR is not set
+
 #
 # Power management and ACPI
 #
 
 #
 
 obj-y := setup.o
+ifeq ($(CONFIG_DMAR), y)
+obj-$(CONFIG_IA64_GENERIC) += machvec.o machvec_vtd.o dig_vtd_iommu.o
+else
 obj-$(CONFIG_IA64_GENERIC) += machvec.o
+endif
+obj-$(CONFIG_IA64_DIG_VTD) += dig_vtd_iommu.o
 
--- /dev/null
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/intel-iommu.h>
+
+void *
+vtd_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+                gfp_t flags)
+{
+       return intel_alloc_coherent(dev, size, dma_handle, flags);
+}
+EXPORT_SYMBOL_GPL(vtd_alloc_coherent);
+
+void
+vtd_free_coherent(struct device *dev, size_t size, void *vaddr,
+                dma_addr_t dma_handle)
+{
+       intel_free_coherent(dev, size, vaddr, dma_handle);
+}
+EXPORT_SYMBOL_GPL(vtd_free_coherent);
+
+dma_addr_t
+vtd_map_single_attrs(struct device *dev, void *addr, size_t size,
+                    int dir, struct dma_attrs *attrs)
+{
+       return intel_map_single(dev, (phys_addr_t)addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_map_single_attrs);
+
+void
+vtd_unmap_single_attrs(struct device *dev, dma_addr_t iova, size_t size,
+                      int dir, struct dma_attrs *attrs)
+{
+       intel_unmap_single(dev, iova, size, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_unmap_single_attrs);
+
+int
+vtd_map_sg_attrs(struct device *dev, struct scatterlist *sglist, int nents,
+                int dir, struct dma_attrs *attrs)
+{
+       return intel_map_sg(dev, sglist, nents, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_map_sg_attrs);
+
+void
+vtd_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
+                  int nents, int dir, struct dma_attrs *attrs)
+{
+       intel_unmap_sg(dev, sglist, nents, dir);
+}
+EXPORT_SYMBOL_GPL(vtd_unmap_sg_attrs);
+
+int
+vtd_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+       return 0;
+}
+EXPORT_SYMBOL_GPL(vtd_dma_mapping_error);
 
--- /dev/null
+#define MACHVEC_PLATFORM_NAME          dig_vtd
+#define MACHVEC_PLATFORM_HEADER                <asm/machvec_dig_vtd.h>
+#include <asm/machvec_init.h>
 
 #define flush_dcache_mmap_unlock(mapping)      do { } while (0)
 
 extern void flush_icache_range (unsigned long start, unsigned long end);
+extern void clflush_cache_range(void *addr, int size);
 
 #define flush_icache_user_range(vma, page, user_addr, len)                                     \
 do {                                                                                           \
 
 #ifdef CONFIG_ACPI
        void    *acpi_handle;
 #endif
+#ifdef CONFIG_DMAR
+       void *iommu; /* hook for IOMMU specific extension */
+#endif
 };
 
 #endif /* _ASM_IA64_DEVICE_H */
 
  */
 #include <asm/machvec.h>
 #include <linux/scatterlist.h>
+#include <asm/swiotlb.h>
+
+struct dma_mapping_ops {
+       int             (*mapping_error)(struct device *dev,
+                                        dma_addr_t dma_addr);
+       void*           (*alloc_coherent)(struct device *dev, size_t size,
+                               dma_addr_t *dma_handle, gfp_t gfp);
+       void            (*free_coherent)(struct device *dev, size_t size,
+                               void *vaddr, dma_addr_t dma_handle);
+       dma_addr_t      (*map_single)(struct device *hwdev, unsigned long ptr,
+                               size_t size, int direction);
+       void            (*unmap_single)(struct device *dev, dma_addr_t addr,
+                               size_t size, int direction);
+       void            (*sync_single_for_cpu)(struct device *hwdev,
+                               dma_addr_t dma_handle, size_t size,
+                               int direction);
+       void            (*sync_single_for_device)(struct device *hwdev,
+                               dma_addr_t dma_handle, size_t size,
+                               int direction);
+       void            (*sync_single_range_for_cpu)(struct device *hwdev,
+                               dma_addr_t dma_handle, unsigned long offset,
+                               size_t size, int direction);
+       void            (*sync_single_range_for_device)(struct device *hwdev,
+                               dma_addr_t dma_handle, unsigned long offset,
+                               size_t size, int direction);
+       void            (*sync_sg_for_cpu)(struct device *hwdev,
+                               struct scatterlist *sg, int nelems,
+                               int direction);
+       void            (*sync_sg_for_device)(struct device *hwdev,
+                               struct scatterlist *sg, int nelems,
+                               int direction);
+       int             (*map_sg)(struct device *hwdev, struct scatterlist *sg,
+                               int nents, int direction);
+       void            (*unmap_sg)(struct device *hwdev,
+                               struct scatterlist *sg, int nents,
+                               int direction);
+       int             (*dma_supported_op)(struct device *hwdev, u64 mask);
+       int             is_phys;
+};
+
+extern struct dma_mapping_ops *dma_ops;
+extern struct ia64_machine_vector ia64_mv;
+extern void set_iommu_machvec(void);
 
 #define dma_alloc_coherent(dev, size, handle, gfp)     \
        platform_dma_alloc_coherent(dev, size, handle, (gfp) | GFP_DMA)
 
 #define dma_is_consistent(d, h)        (1)     /* all we do is coherent memory... */
 
+static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
+{
+       return dma_ops;
+}
+
 #endif /* _ASM_IA64_DMA_MAPPING_H */
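
The struct above introduces an x86-style DMA ops table for ia64. As a minimal sketch (not part of the patch; the wrapper name my_dma_map_sg() is hypothetical), a caller dispatches through get_dma_ops() like this:

static inline int my_dma_map_sg(struct device *dev, struct scatterlist *sg,
                                int nents, int direction)
{
        struct dma_mapping_ops *ops = get_dma_ops(dev); /* currently the global dma_ops */

        return ops->map_sg(dev, sg, nents, direction);  /* e.g. swiotlb_map_sg() */
}

The indirection exists so that pci-swiotlb.c, later in this patch, can install swiotlb_dma_ops as the global dma_ops when no hardware IOMMU is detected.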
 
--- /dev/null
+#ifndef _ASM_IA64_IOMMU_H
+#define _ASM_IA64_IOMMU_H 1
+
+#define cpu_has_x2apic 0
+/* 10 seconds */
+#define DMAR_OPERATION_TIMEOUT (((cycles_t) local_cpu_data->itc_freq)*10)
+
+extern void pci_iommu_shutdown(void);
+extern void no_iommu_init(void);
+extern int force_iommu, no_iommu;
+extern int iommu_detected;
+extern void iommu_dma_init(void);
+extern void machvec_init(const char *name);
+extern int forbid_dac;
+
+#endif
 
 #  include <asm/machvec_hpsim.h>
 # elif defined (CONFIG_IA64_DIG)
 #  include <asm/machvec_dig.h>
+# elif defined(CONFIG_IA64_DIG_VTD)
+#  include <asm/machvec_dig_vtd.h>
 # elif defined (CONFIG_IA64_HP_ZX1)
 #  include <asm/machvec_hpzx1.h>
 # elif defined (CONFIG_IA64_HP_ZX1_SWIOTLB)
 
--- /dev/null
+#ifndef _ASM_IA64_MACHVEC_DIG_VTD_h
+#define _ASM_IA64_MACHVEC_DIG_VTD_h
+
+extern ia64_mv_setup_t                 dig_setup;
+extern ia64_mv_dma_alloc_coherent      vtd_alloc_coherent;
+extern ia64_mv_dma_free_coherent       vtd_free_coherent;
+extern ia64_mv_dma_map_single_attrs    vtd_map_single_attrs;
+extern ia64_mv_dma_unmap_single_attrs  vtd_unmap_single_attrs;
+extern ia64_mv_dma_map_sg_attrs                vtd_map_sg_attrs;
+extern ia64_mv_dma_unmap_sg_attrs      vtd_unmap_sg_attrs;
+extern ia64_mv_dma_supported           iommu_dma_supported;
+extern ia64_mv_dma_mapping_error       vtd_dma_mapping_error;
+extern ia64_mv_dma_init                        pci_iommu_alloc;
+
+/*
+ * This stuff has dual use!
+ *
+ * For a generic kernel, the macros are used to initialize the
+ * platform's machvec structure.  When compiling a non-generic kernel,
+ * the macros are used directly.
+ */
+#define platform_name                          "dig_vtd"
+#define platform_setup                         dig_setup
+#define platform_dma_init                      pci_iommu_alloc
+#define platform_dma_alloc_coherent            vtd_alloc_coherent
+#define platform_dma_free_coherent             vtd_free_coherent
+#define platform_dma_map_single_attrs          vtd_map_single_attrs
+#define platform_dma_unmap_single_attrs                vtd_unmap_single_attrs
+#define platform_dma_map_sg_attrs              vtd_map_sg_attrs
+#define platform_dma_unmap_sg_attrs            vtd_unmap_sg_attrs
+#define platform_dma_sync_single_for_cpu       machvec_dma_sync_single
+#define platform_dma_sync_sg_for_cpu           machvec_dma_sync_sg
+#define platform_dma_sync_single_for_device    machvec_dma_sync_single
+#define platform_dma_sync_sg_for_device                machvec_dma_sync_sg
+#define platform_dma_supported                 iommu_dma_supported
+#define platform_dma_mapping_error             vtd_dma_mapping_error
+
+#endif /* _ASM_IA64_MACHVEC_DIG_VTD_h */
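
As an aside on the dual-use comment above, here is an illustrative (hypothetical) call site showing how the platform_* macros resolve in the two build modes:

/* hypothetical driver call site, for illustration only */
int n = platform_dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, NULL);

/*
 * CONFIG_IA64_DIG_VTD=y: the macro expands directly to
 *     n = vtd_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, NULL);
 *
 * CONFIG_IA64_GENERIC=y: <asm/machvec.h> instead defines
 * platform_dma_map_sg_attrs as ia64_mv.dma_map_sg_attrs, and
 * machvec_dig_vtd.c (via MACHVEC_PLATFORM_HEADER) uses these same macros
 * to fill that field with vtd_map_sg_attrs when the "dig_vtd" machvec is
 * selected at boot.
 */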
 
+#include <asm/iommu.h>
 #include <asm/machvec.h>
 
 extern ia64_mv_send_ipi_t ia64_send_ipi;
 
        return channel ? isa_irq_to_vector(15) : isa_irq_to_vector(14);
 }
 
+#ifdef CONFIG_DMAR
+extern void pci_iommu_alloc(void);
+#endif
 #endif /* _ASM_IA64_PCI_H */
 
--- /dev/null
+#ifndef ASM_IA64__SWIOTLB_H
+#define ASM_IA64__SWIOTLB_H
+
+#include <linux/dma-mapping.h>
+
+/* SWIOTLB interface */
+
+extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr,
+                                    size_t size, int dir);
+extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+                                   dma_addr_t *dma_handle, gfp_t flags);
+extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+                                size_t size, int dir);
+extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
+                                       dma_addr_t dev_addr,
+                                       size_t size, int dir);
+extern void swiotlb_sync_single_for_device(struct device *hwdev,
+                                          dma_addr_t dev_addr,
+                                          size_t size, int dir);
+extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev,
+                                             dma_addr_t dev_addr,
+                                             unsigned long offset,
+                                             size_t size, int dir);
+extern void swiotlb_sync_single_range_for_device(struct device *hwdev,
+                                                dma_addr_t dev_addr,
+                                                unsigned long offset,
+                                                size_t size, int dir);
+extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
+                                   struct scatterlist *sg, int nelems,
+                                   int dir);
+extern void swiotlb_sync_sg_for_device(struct device *hwdev,
+                                      struct scatterlist *sg, int nelems,
+                                      int dir);
+extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
+                         int nents, int direction);
+extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
+                            int nents, int direction);
+extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
+extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
+                                 void *vaddr, dma_addr_t dma_handle);
+extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
+extern void swiotlb_init(void);
+
+extern int swiotlb_force;
+
+#ifdef CONFIG_SWIOTLB
+extern int swiotlb;
+extern void pci_swiotlb_init(void);
+#else
+#define swiotlb 0
+static inline void pci_swiotlb_init(void)
+{
+}
+#endif
+
+#endif /* ASM_IA64__SWIOTLB_H */
 
 ifneq ($(CONFIG_IA64_ESI),)
 obj-y                          += esi_stub.o   # must be in kernel proper
 endif
+obj-$(CONFIG_DMAR)             += pci-dma.o
+ifeq ($(CONFIG_DMAR), y)
+obj-$(CONFIG_SWIOTLB)          += pci-swiotlb.o
+endif
 
 # The gate DSO image is built using a special linker script.
 targets += gate.so gate-syms.o
 
        struct acpi_table_rsdp *rsdp;
        struct acpi_table_xsdt *xsdt;
        struct acpi_table_header *hdr;
+#ifdef CONFIG_DMAR
+       u64 i, nentries;
+#endif
 
        rsdp_phys = acpi_find_rsdp();
        if (!rsdp_phys) {
                        return "sn2";
        }
 
+#ifdef CONFIG_DMAR
+       /* Look for Intel IOMMU */
+       nentries = (hdr->length - sizeof(*hdr)) /
+                        sizeof(xsdt->table_offset_entry[0]);
+       for (i = 0; i < nentries; i++) {
+               hdr = __va(xsdt->table_offset_entry[i]);
+               if (strncmp(hdr->signature, ACPI_SIG_DMAR,
+                       sizeof(ACPI_SIG_DMAR) - 1) == 0)
+                       return "dig_vtd";
+       }
+#endif
+
        return "dig";
 #else
 # if defined (CONFIG_IA64_HP_SIM)
        return "uv";
 # elif defined (CONFIG_IA64_DIG)
        return "dig";
+# elif defined(CONFIG_IA64_DIG_VTD)
+       return "dig_vtd";
 # else
 #      error Unknown platform.  Fix acpi.c.
 # endif
 
 #include <linux/pci.h>
 #include <linux/irq.h>
 #include <linux/msi.h>
+#include <linux/dmar.h>
 #include <asm/smp.h>
 
 /*
 
        return ia64_teardown_msi_irq(irq);
 }
+
+#ifdef CONFIG_DMAR
+#ifdef CONFIG_SMP
+static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+       struct irq_cfg *cfg = irq_cfg + irq;
+       struct msi_msg msg;
+       int cpu = first_cpu(mask);
+
+       if (!cpu_online(cpu))
+               return;
+
+       if (irq_prepare_move(irq, cpu))
+               return;
+
+       dmar_msi_read(irq, &msg);
+
+       msg.data &= ~MSI_DATA_VECTOR_MASK;
+       msg.data |= MSI_DATA_VECTOR(cfg->vector);
+       msg.address_lo &= ~MSI_ADDR_DESTID_MASK;
+       msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
+
+       dmar_msi_write(irq, &msg);
+       irq_desc[irq].affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+struct irq_chip dmar_msi_type = {
+       .name = "DMAR_MSI",
+       .unmask = dmar_msi_unmask,
+       .mask = dmar_msi_mask,
+       .ack = ia64_ack_msi_irq,
+#ifdef CONFIG_SMP
+       .set_affinity = dmar_msi_set_affinity,
+#endif
+       .retrigger = ia64_msi_retrigger_irq,
+};
+
+static int
+msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+       struct irq_cfg *cfg = irq_cfg + irq;
+       unsigned dest;
+       cpumask_t mask;
+
+       cpus_and(mask, irq_to_domain(irq), cpu_online_map);
+       dest = cpu_physical_id(first_cpu(mask));
+
+       msg->address_hi = 0;
+       msg->address_lo =
+               MSI_ADDR_HEADER |
+               MSI_ADDR_DESTMODE_PHYS |
+               MSI_ADDR_REDIRECTION_CPU |
+               MSI_ADDR_DESTID_CPU(dest);
+
+       msg->data =
+               MSI_DATA_TRIGGER_EDGE |
+               MSI_DATA_LEVEL_ASSERT |
+               MSI_DATA_DELIVERY_FIXED |
+               MSI_DATA_VECTOR(cfg->vector);
+       return 0;
+}
+
+int arch_setup_dmar_msi(unsigned int irq)
+{
+       int ret;
+       struct msi_msg msg;
+
+       ret = msi_compose_msg(NULL, irq, &msg);
+       if (ret < 0)
+               return ret;
+       dmar_msi_write(irq, &msg);
+       set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+               "edge");
+       return 0;
+}
+#endif /* CONFIG_DMAR */
 
--- /dev/null
+/*
+ * Dynamic DMA mapping support.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/dmar.h>
+#include <asm/iommu.h>
+#include <asm/machvec.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/system.h>
+
+#ifdef CONFIG_DMAR
+
+#include <linux/kernel.h>
+
+#include <asm/page.h>
+
+dma_addr_t bad_dma_address __read_mostly;
+EXPORT_SYMBOL(bad_dma_address);
+
+static int iommu_sac_force __read_mostly;
+
+int no_iommu __read_mostly;
+#ifdef CONFIG_IOMMU_DEBUG
+int force_iommu __read_mostly = 1;
+#else
+int force_iommu __read_mostly;
+#endif
+
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly;
+
+/* Dummy device used for NULL arguments (normally ISA).  A smaller DMA
+   mask would probably be better, but this is bug-for-bug compatible
+   with i386. */
+struct device fallback_dev = {
+       .bus_id = "fallback device",
+       .coherent_dma_mask = DMA_32BIT_MASK,
+       .dma_mask = &fallback_dev.coherent_dma_mask,
+};
+
+void __init pci_iommu_alloc(void)
+{
+       /*
+        * The order of these functions is important for
+        * fall-back/fail-over reasons
+        */
+       detect_intel_iommu();
+
+#ifdef CONFIG_SWIOTLB
+       pci_swiotlb_init();
+#endif
+}
+
+static int __init pci_iommu_init(void)
+{
+       if (iommu_detected)
+               intel_iommu_init();
+
+       return 0;
+}
+
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
+
+void pci_iommu_shutdown(void)
+{
+       return;
+}
+
+void __init
+iommu_dma_init(void)
+{
+       return;
+}
+
+struct dma_mapping_ops *dma_ops;
+EXPORT_SYMBOL(dma_ops);
+
+int iommu_dma_supported(struct device *dev, u64 mask)
+{
+       struct dma_mapping_ops *ops = get_dma_ops(dev);
+
+#ifdef CONFIG_PCI
+       if (mask > 0xffffffff && forbid_dac > 0) {
+               dev_info(dev, "Disallowing DAC for device\n");
+               return 0;
+       }
+#endif
+
+       if (ops->dma_supported_op)
+               return ops->dma_supported_op(dev, mask);
+
+       /* Copied from i386. Doesn't make much sense, because it will
+          only work for pci_alloc_coherent.
+          The caller just has to use GFP_DMA in this case. */
+       if (mask < DMA_24BIT_MASK)
+               return 0;
+
+       /* Tell the device to use SAC when IOMMU force is on.  This
+          allows the driver to use cheaper accesses in some cases.
+
+          Problem with this is that if we overflow the IOMMU area and
+          return DAC as fallback address the device may not handle it
+          correctly.
+
+          As a special case some controllers have a 39bit address
+          mode that is as efficient as 32bit (aic79xx). Don't force
+          SAC for these.  Assume all masks <= 40 bits are of this
+          type. Normally this doesn't make any difference, but gives
+          more gentle handling of IOMMU overflow. */
+       if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+               dev_info(dev, "Force SAC with mask %lx\n", mask);
+               return 0;
+       }
+
+       return 1;
+}
+EXPORT_SYMBOL(iommu_dma_supported);
+
+#endif
 
--- /dev/null
+/* Glue code to lib/swiotlb.c */
+
+#include <linux/pci.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/iommu.h>
+#include <asm/machvec.h>
+
+int swiotlb __read_mostly;
+EXPORT_SYMBOL(swiotlb);
+
+struct dma_mapping_ops swiotlb_dma_ops = {
+       .mapping_error = swiotlb_dma_mapping_error,
+       .alloc_coherent = swiotlb_alloc_coherent,
+       .free_coherent = swiotlb_free_coherent,
+       .map_single = swiotlb_map_single,
+       .unmap_single = swiotlb_unmap_single,
+       .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
+       .sync_single_for_device = swiotlb_sync_single_for_device,
+       .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
+       .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
+       .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
+       .sync_sg_for_device = swiotlb_sync_sg_for_device,
+       .map_sg = swiotlb_map_sg,
+       .unmap_sg = swiotlb_unmap_sg,
+       .dma_supported_op = swiotlb_dma_supported,
+};
+
+void __init pci_swiotlb_init(void)
+{
+       if (!iommu_detected) {
+#ifdef CONFIG_IA64_GENERIC
+               swiotlb = 1;
+               printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n");
+               machvec_init("dig");
+               swiotlb_init();
+               dma_ops = &swiotlb_dma_ops;
+#else
+               panic("Unable to find Intel IOMMU");
+#endif
+       }
+}
 
  */
 #define        I_CACHE_STRIDE_SHIFT    5       /* Safest way to go: 32 bytes by 32 bytes */
 unsigned long ia64_i_cache_stride_shift = ~0;
+/*
+ * "clflush_cache_range()" needs to know the processor-dependent stride
+ * size to use when it flushes cache lines that include both d-cache
+ * and i-cache.
+ */
+/* Safest way to go: 32 bytes by 32 bytes */
+#define        CACHE_STRIDE_SHIFT      5
+unsigned long ia64_cache_stride_shift = ~0;
 
 /*
  * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1).  This
 }
 
 /*
- * Calculate the max. cache line size.
+ * Do the following calculations:
  *
- * In addition, the minimum of the i-cache stride sizes is calculated for
- * "flush_icache_range()".
+ * 1. the max. cache line size.
+ * 2. the minimum of the i-cache stride sizes for "flush_icache_range()".
+ * 3. the minimum of the cache stride sizes for "clflush_cache_range()".
  */
 static void __cpuinit
-get_max_cacheline_size (void)
+get_cache_info(void)
 {
        unsigned long line_size, max = 1;
        u64 l, levels, unique_caches;
                 max = SMP_CACHE_BYTES;
                /* Safest setup for "flush_icache_range()" */
                ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT;
+               /* Safest setup for "clflush_cache_range()" */
+               ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
                goto out;
         }
 
        for (l = 0; l < levels; ++l) {
-               status = ia64_pal_cache_config_info(l, /* cache_type (data_or_unified)= */ 2,
-                                                   &cci);
+               /* cache_type (data_or_unified)=2 */
+               status = ia64_pal_cache_config_info(l, 2, &cci);
                if (status != 0) {
                        printk(KERN_ERR
                               "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n",
                        max = SMP_CACHE_BYTES;
                        /* The safest setup for "flush_icache_range()" */
                        cci.pcci_stride = I_CACHE_STRIDE_SHIFT;
+                       /* The safest setup for "clflush_cache_range()" */
+                       ia64_cache_stride_shift = CACHE_STRIDE_SHIFT;
                        cci.pcci_unified = 1;
+               } else {
+                       if (cci.pcci_stride < ia64_cache_stride_shift)
+                               ia64_cache_stride_shift = cci.pcci_stride;
+
+                       line_size = 1 << cci.pcci_line_size;
+                       if (line_size > max)
+                               max = line_size;
                }
-               line_size = 1 << cci.pcci_line_size;
-               if (line_size > max)
-                       max = line_size;
+
                if (!cci.pcci_unified) {
-                       status = ia64_pal_cache_config_info(l,
-                                                   /* cache_type (instruction)= */ 1,
-                                                   &cci);
+                       /* cache_type (instruction)=1 */
+                       status = ia64_pal_cache_config_info(l, 1, &cci);
                        if (status != 0) {
                                printk(KERN_ERR
                                "%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n",
        }
 #endif
 
-       get_max_cacheline_size();
+       get_cache_info();
 
        /*
         * We can't pass "local_cpu_data" to identify_cpu() because we haven't called
 
        mov     ar.lc=r3                // restore ar.lc
        br.ret.sptk.many rp
 END(flush_icache_range)
+
+       /*
+        * clflush_cache_range(start,size)
+        *
+        *      Flush cache lines from start to start+size-1.
+        *
+        *      Must deal with range from start to start+size-1 but nothing else
+        *      (need to be careful not to touch addresses that may be
+        *      unmapped).
+        *
+        *      Note: "in0" and "in1" are preserved for debugging purposes.
+        */
+       .section .kprobes.text,"ax"
+GLOBAL_ENTRY(clflush_cache_range)
+
+       .prologue
+       alloc   r2=ar.pfs,2,0,0,0
+       movl    r3=ia64_cache_stride_shift
+       mov     r21=1
+       add     r22=in1,in0
+       ;;
+       ld8     r20=[r3]                // r20: stride shift
+       sub     r22=r22,r0,1            // last byte address
+       ;;
+       shr.u   r23=in0,r20             // start / (stride size)
+       shr.u   r22=r22,r20             // (last byte address) / (stride size)
+       shl     r21=r21,r20             // r21: stride size of the i-cache(s)
+       ;;
+       sub     r8=r22,r23              // number of strides - 1
+       shl     r24=r23,r20             // r24: addresses for "fc" =
+                                       //      "start" rounded down to stride
+                                       //      boundary
+       .save   ar.lc,r3
+       mov     r3=ar.lc                // save ar.lc
+       ;;
+
+       .body
+       mov     ar.lc=r8
+       ;;
+       /*
+        * 32 byte aligned loop, even number of (actually 2) bundles
+        */
+.Loop_fc:
+       fc      r24             // issuable on M0 only
+       add     r24=r21,r24     // we flush "stride size" bytes per iteration
+       nop.i   0
+       br.cloop.sptk.few .Loop_fc
+       ;;
+       sync.i
+       ;;
+       srlz.i
+       ;;
+       mov     ar.lc=r3                // restore ar.lc
+       br.ret.sptk.many rp
+END(clflush_cache_range)
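
For reference, a C-level sketch of what the assembly above does (illustration only; the name clflush_cache_range_sketch is editorial, while ia64_fc(), ia64_sync_i() and ia64_srlz_i() are the existing <asm/intrinsics.h> intrinsics for the fc, sync.i and srlz.i instructions):

#include <asm/intrinsics.h>       /* ia64_fc(), ia64_sync_i(), ia64_srlz_i() */

extern unsigned long ia64_cache_stride_shift;

void clflush_cache_range_sketch(void *addr, int size)
{
        unsigned long start  = (unsigned long)addr;
        unsigned long last   = start + size - 1;        /* last byte in range */
        unsigned long stride = 1UL << ia64_cache_stride_shift;
        unsigned long p;

        /* round start down to a stride boundary, flush one line per stride */
        for (p = start & ~(stride - 1); p <= last; p += stride)
                ia64_fc((void *)p);
        ia64_sync_i();                                  /* make the fc flushes visible */
        ia64_srlz_i();                                  /* serialize the instruction stream */
}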