return err;             /* That will often be -EOPNOTSUPP */
 }
 
+/*
+ * swap allocation tells the device that a cluster of swap can now be
+ * discarded, to allow the swap device to optimize its wear-levelling.
+ */
+static void discard_swap_cluster(struct swap_info_struct *si,
+                                pgoff_t start_page, pgoff_t nr_pages)
+{
+       struct swap_extent *se = si->curr_swap_extent;
+       int found_extent = 0;
+
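+       /*
+        * Walk the extent list, issuing a discard for each piece of the
+        * range; stop early if the block layer reports an error.
+        */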
+       while (nr_pages) {
+               struct list_head *lh;
+
+               if (se->start_page <= start_page &&
+                   start_page < se->start_page + se->nr_pages) {
+                       pgoff_t offset = start_page - se->start_page;
+                       sector_t start_block = se->start_block + offset;
+                       sector_t nr_blocks = se->nr_pages - offset;
+
+                       if (nr_blocks > nr_pages)
+                               nr_blocks = nr_pages;
+                       start_page += nr_blocks;
+                       nr_pages -= nr_blocks;
+
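+                       /* Cache the first matching extent for the next lookup */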
+                       if (!found_extent++)
+                               si->curr_swap_extent = se;
+
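+                       /* Convert from page units to 512-byte sectors */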
+                       start_block <<= PAGE_SHIFT - 9;
+                       nr_blocks <<= PAGE_SHIFT - 9;
+                       if (blkdev_issue_discard(si->bdev, start_block,
+                                                       nr_blocks, GFP_NOIO))
+                               break;
+               }
+
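+               /* Advance to the next extent, skipping the list head on wraparound */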
+               lh = se->list.next;
+               if (lh == &si->extent_list)
+                       lh = lh->next;
+               se = list_entry(lh, struct swap_extent, list);
+       }
+}
+
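+/*
+ * Action function for the wait_on_bit() call below: give up the CPU
+ * until the SWP_DISCARDING bit is cleared and the waiter is woken.
+ */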
+static int wait_for_discard(void *word)
+{
+       schedule();
+       return 0;
+}
+
 #define SWAPFILE_CLUSTER       256
 #define LATENCY_LIMIT          256
 
 static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 {
        unsigned long offset;
-       unsigned long last_in_cluster;
+       unsigned long last_in_cluster = 0;
        int latency_ration = LATENCY_LIMIT;
+       int found_free_cluster = 0;
 
        /*
         * We try to cluster swap pages by allocating them sequentially
                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
                        goto checks;
                }
+               if (si->flags & SWP_DISCARDABLE) {
+                       /*
+                        * Start range check on racing allocations, in case
+                        * they overlap the cluster we eventually decide on
+                        * (we scan without swap_lock to allow preemption).
+                        * It's hardly conceivable that cluster_nr could be
+                        * wrapped during our scan, but don't depend on it.
+                        */
+                       if (si->lowest_alloc)
+                               goto checks;
+                       si->lowest_alloc = si->max;
+                       si->highest_alloc = 0;
+               }
                spin_unlock(&swap_lock);
 
                offset = si->lowest_bit;
                                offset -= SWAPFILE_CLUSTER - 1;
                                si->cluster_next = offset;
                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
+                               found_free_cluster = 1;
                                goto checks;
                        }
                        if (unlikely(--latency_ration < 0)) {
                offset = si->lowest_bit;
                spin_lock(&swap_lock);
                si->cluster_nr = SWAPFILE_CLUSTER - 1;
+               si->lowest_alloc = 0;
        }
 
 checks:
        si->swap_map[offset] = 1;
        si->cluster_next = offset + 1;
        si->flags -= SWP_SCANNING;
+
+       if (si->lowest_alloc) {
+               /*
+                * Only set when SWP_DISCARDABLE, and there's a scan
+                * for a free cluster in progress or just completed.
+                */
+               if (found_free_cluster) {
+                       /*
+                        * To optimize wear-levelling, discard the
+                        * old data of the cluster, taking care not to
+                        * discard any of its pages that have already
+                        * been allocated by racing tasks (offset has
+                        * already stepped over any at the beginning).
+                        */
+                       if (offset < si->highest_alloc &&
+                           si->lowest_alloc <= last_in_cluster)
+                               last_in_cluster = si->lowest_alloc - 1;
+                       si->flags |= SWP_DISCARDING;
+                       spin_unlock(&swap_lock);
+
+                       if (offset < last_in_cluster)
+                               discard_swap_cluster(si, offset,
+                                       last_in_cluster - offset + 1);
+
+                       spin_lock(&swap_lock);
+                       si->lowest_alloc = 0;
+                       si->flags &= ~SWP_DISCARDING;
+
+                       smp_mb();       /* wake_up_bit advises this */
+                       wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
+
+               } else if (si->flags & SWP_DISCARDING) {
+                       /*
+                        * Delay using pages allocated by racing tasks
+                        * until the whole discard has been issued. We
+                        * could defer that delay until swap_writepage,
+                        * but it's easier to keep this self-contained.
+                        */
+                       spin_unlock(&swap_lock);
+                       wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
+                               wait_for_discard, TASK_UNINTERRUPTIBLE);
+                       spin_lock(&swap_lock);
+               } else {
+                       /*
+                        * Note pages allocated by racing tasks while
+                        * the scan for a free cluster is in progress, so
+                        * that its final discard can exclude them.
+                        */
+                       if (offset < si->lowest_alloc)
+                               si->lowest_alloc = offset;
+                       if (offset > si->highest_alloc)
+                               si->highest_alloc = offset;
+               }
+       }
        return offset;
 
 scan: