LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
@ 2011-02-07  3:26 Dan Magenheimer
  2011-02-08 22:48 ` Minchan Kim
  2011-02-15 16:53 ` Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 13+ messages in thread
From: Dan Magenheimer @ 2011-02-07  3:26 UTC (permalink / raw)
  To: gregkh, chris.mason, akpm, torvalds, matthew, linux-kernel,
	linux-mm, ngupta, jeremy, kurt.hackel, npiggin, riel,
	konrad.wilk, dan.magenheimer, mel, minchan.kim, kosaki.motohiro,
	sfr, wfg, tytso, viro, hughd, hannes

[PATCH V2 2/3] drivers/staging: zcache: host services and PAM services

Zcache provides host services (memory allocation) for tmem,
a "shim" to interface cleancache and frontswap to tmem, and
two different page-addressable memory implemenations using
lzo1x compression.  The first, "compression buddies" ("zbud")
compresses pairs of pages and supplies a shrinker interface
that allows entire pages to be reclaimed.  The second is
a shim to xvMalloc which is more space-efficient but
less receptive to page reclamation.  The first is used
for ephemeral pools and the second for persistent pools.
All ephemeral pools share the same memory, that is, even
pages from different pools can share the same page.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Signed-off-by: Nitin Gupta <ngupta@vflare.org>

---

Diffstat:
 drivers/staging/zcache/zcache.c          | 1657 +++++++++++++++++++++
 1 file changed, 1657 insertions(+)

--- linux-2.6.37/drivers/staging/zcache/zcache.c	1969-12-31 17:00:00.000000000 -0700
+++ linux-2.6.37-zcache/drivers/staging/zcache/zcache.c	2011-02-05 15:52:26.000000000 -0700
@@ -0,0 +1,1657 @@
+/*
+ * zcache.c
+ *
+ * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
+ * Copyright (c) 2010,2011, Nitin Gupta
+ *
+ * Zcache provides an in-kernel "host implementation" for transcendent memory
+ * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
+ * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
+ * 1) "compression buddies" ("zbud") is used for ephemeral pages
+ * 2) xvmalloc is used for persistent pages.
+ * Xvmalloc (based on the TLSF allocator) has very low fragmentation
+ * so maximizes space efficiency, while zbud allows pairs (and potentially,
+ * in the future, more than a pair of) compressed pages to be closely linked
+ * so that reclaiming can be done via the kernel's physical-page-oriented
+ * "shrinker" interface.
+ *
+ * [1] For a definition of page-accessible memory (aka PAM), see:
+ *   http://marc.info/?l=linux-mm&m=127811271605009
+ */
+
+#include <linux/cpu.h>
+#include <linux/highmem.h>
+#include <linux/list.h>
+#include <linux/lzo.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include "tmem.h"
+
+#include "../zram/xvmalloc.h" /* if built in drivers/staging */
+
+#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
+#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
+#endif
+#ifdef CONFIG_CLEANCACHE
+#include <linux/cleancache.h>
+#endif
+#ifdef CONFIG_FRONTSWAP
+#include <linux/frontswap.h>
+#endif
+
+#if 0
+/* this is more aggressive but may cause other problems? */
+#define ZCACHE_GFP_MASK	(GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
+#else
+#define ZCACHE_GFP_MASK \
+	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
+#endif
+
+/**********
+ * Compression buddies ("zbud") provides for packing two (or, possibly
+ * in the future, more) compressed ephemeral pages into a single "raw"
+ * (physical) page and tracking them with data structures so that
+ * the raw pages can be easily reclaimed.
+ *
+ * A zbud page ("zbpg") is an aligned page containing a list_head,
+ * a lock, and two "zbud headers".  The remainder of the physical
+ * page is divided up into aligned 64-byte "chunks" which contain
+ * the compressed data for zero, one, or two zbuds.  Each zbpg
+ * resides on: (1) an "unused list" if it has no zbuds; (2) a
+ * "buddied" list if it is fully populated  with two zbuds; or
+ * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
+ * the one unbuddied zbud uses.  The data inside a zbpg cannot be
+ * read or written unless the zbpg's lock is held.
+ */
+
+#define ZBH_SENTINEL  0x43214321
+#define ZBPG_SENTINEL  0xdeadbeef
+
+#define ZBUD_MAX_BUDS 2
+
+struct zbud_hdr {
+	uint32_t pool_id;
+	struct tmem_oid oid;
+	uint32_t index;
+	uint16_t size; /* compressed size in bytes, zero means unused */
+	DECL_SENTINEL
+};
+
+struct zbud_page {
+	struct list_head bud_list;
+	spinlock_t lock;
+	struct zbud_hdr buddy[ZBUD_MAX_BUDS];
+	DECL_SENTINEL
+	/* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
+};
+
+#define CHUNK_SHIFT	6
+#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
+#define CHUNK_MASK	(~(CHUNK_SIZE-1))
+#define NCHUNKS		(((PAGE_SIZE - sizeof(struct zbud_page)) & \
+				CHUNK_MASK) >> CHUNK_SHIFT)
+#define MAX_CHUNK	(NCHUNKS-1)
+
+static struct {
+	struct list_head list;
+	unsigned count;
+} zbud_unbuddied[NCHUNKS];
+/* list N contains pages with N chunks USED and NCHUNKS-N unused */
+/* element 0 is never used but optimizing that isn't worth it */
+static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
+
+struct list_head zbud_buddied_list;
+static unsigned long zcache_zbud_buddied_count;
+
+/* protects the buddied list and all unbuddied lists */
+static DEFINE_SPINLOCK(zbud_budlists_spinlock);
+
+static LIST_HEAD(zbpg_unused_list);
+static unsigned long zcache_zbpg_unused_list_count;
+
+/* protects the unused page list */
+static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
+
+static atomic_t zcache_zbud_curr_raw_pages;
+static atomic_t zcache_zbud_curr_zpages;
+static unsigned long zcache_zbud_curr_zbytes;
+static unsigned long zcache_zbud_cumul_zpages;
+static unsigned long zcache_zbud_cumul_zbytes;
+static unsigned long zcache_compress_poor;
+
+/* forward references */
+static void *zcache_get_free_page(void);
+static void zcache_free_page(void *p);
+
+/*
+ * zbud helper functions
+ */
+
+static inline unsigned zbud_max_buddy_size(void)
+{
+	return MAX_CHUNK << CHUNK_SHIFT;
+}
+
+static inline unsigned zbud_size_to_chunks(unsigned size)
+{
+	BUG_ON(size == 0 || size > zbud_max_buddy_size());
+	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
+}
+
+static inline int zbud_budnum(struct zbud_hdr *zh)
+{
+	unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
+	struct zbud_page *zbpg = NULL;
+	unsigned budnum = -1U;
+	int i;
+
+	for (i = 0; i < ZBUD_MAX_BUDS; i++)
+		if (offset == offsetof(typeof(*zbpg), buddy[i])) {
+			budnum = i;
+			break;
+		}
+	BUG_ON(budnum == -1U);
+	return budnum;
+}
+
+static char *zbud_data(struct zbud_hdr *zh, unsigned size)
+{
+	struct zbud_page *zbpg;
+	char *p;
+	unsigned budnum;
+
+	ASSERT_SENTINEL(zh, ZBH);
+	budnum = zbud_budnum(zh);
+	BUG_ON(size == 0 || size > zbud_max_buddy_size());
+	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
+	ASSERT_SPINLOCK(&zbpg->lock);
+	p = (char *)zbpg;
+	if (budnum == 0)
+		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
+							CHUNK_MASK);
+	else if (budnum == 1)
+		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
+	return p;
+}
+
+/*
+ * zbud raw page management
+ */
+
+static struct zbud_page *zbud_alloc_raw_page(void)
+{
+	struct zbud_page *zbpg = NULL;
+	struct zbud_hdr *zh0, *zh1;
+	bool recycled = 0;
+
+	/* if any pages on the zbpg list, use one */
+	spin_lock(&zbpg_unused_list_spinlock);
+	if (!list_empty(&zbpg_unused_list)) {
+		zbpg = list_first_entry(&zbpg_unused_list,
+				struct zbud_page, bud_list);
+		list_del_init(&zbpg->bud_list);
+		zcache_zbpg_unused_list_count--;
+		recycled = 1;
+	}
+	spin_unlock(&zbpg_unused_list_spinlock);
+	if (zbpg == NULL)
+		/* none on zbpg list, try to get a kernel page */
+		zbpg = zcache_get_free_page();
+	if (likely(zbpg != NULL)) {
+		INIT_LIST_HEAD(&zbpg->bud_list);
+		zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
+		spin_lock_init(&zbpg->lock);
+		if (recycled) {
+			ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
+			SET_SENTINEL(zbpg, ZBPG);
+			BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
+			BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
+		} else {
+			atomic_inc(&zcache_zbud_curr_raw_pages);
+			INIT_LIST_HEAD(&zbpg->bud_list);
+			SET_SENTINEL(zbpg, ZBPG);
+			zh0->size = 0; zh1->size = 0;
+			tmem_oid_set_invalid(&zh0->oid);
+			tmem_oid_set_invalid(&zh1->oid);
+		}
+	}
+	return zbpg;
+}
+
+static void zbud_free_raw_page(struct zbud_page *zbpg)
+{
+	struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
+
+	ASSERT_SENTINEL(zbpg, ZBPG);
+	BUG_ON(!list_empty(&zbpg->bud_list));
+	ASSERT_SPINLOCK(&zbpg->lock);
+	BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
+	BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
+	INVERT_SENTINEL(zbpg, ZBPG);
+	spin_unlock(&zbpg->lock);
+	spin_lock(&zbpg_unused_list_spinlock);
+	list_add(&zbpg->bud_list, &zbpg_unused_list);
+	zcache_zbpg_unused_list_count++;
+	spin_unlock(&zbpg_unused_list_spinlock);
+}
+
+/*
+ * core zbud handling routines
+ */
+
+static unsigned zbud_free(struct zbud_hdr *zh)
+{
+	unsigned size;
+
+	ASSERT_SENTINEL(zh, ZBH);
+	BUG_ON(!tmem_oid_valid(&zh->oid));
+	size = zh->size;
+	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
+	zh->size = 0;
+	tmem_oid_set_invalid(&zh->oid);
+	INVERT_SENTINEL(zh, ZBH);
+	zcache_zbud_curr_zbytes -= size;
+	atomic_dec(&zcache_zbud_curr_zpages);
+	return size;
+}
+
+static void zbud_free_and_delist(struct zbud_hdr *zh)
+{
+	unsigned chunks;
+	struct zbud_hdr *zh_other;
+	unsigned budnum = zbud_budnum(zh), size;
+	struct zbud_page *zbpg =
+		container_of(zh, struct zbud_page, buddy[budnum]);
+
+	spin_lock(&zbpg->lock);
+	if (list_empty(&zbpg->bud_list)) {
+		/* ignore zombie page... see zbud_evict_pages() */
+		spin_unlock(&zbpg->lock);
+		return;
+	}
+	size = zbud_free(zh);
+	ASSERT_SPINLOCK(&zbpg->lock);
+	zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
+	if (zh_other->size == 0) { /* was unbuddied: unlist and free */
+		chunks = zbud_size_to_chunks(size) ;
+		spin_lock(&zbud_budlists_spinlock);
+		BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
+		list_del_init(&zbpg->bud_list);
+		zbud_unbuddied[chunks].count--;
+		spin_unlock(&zbud_budlists_spinlock);
+		zbud_free_raw_page(zbpg);
+	} else { /* was buddied: move remaining buddy to unbuddied list */
+		chunks = zbud_size_to_chunks(zh_other->size) ;
+		spin_lock(&zbud_budlists_spinlock);
+		list_del_init(&zbpg->bud_list);
+		zcache_zbud_buddied_count--;
+		list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
+		zbud_unbuddied[chunks].count++;
+		spin_unlock(&zbud_budlists_spinlock);
+		spin_unlock(&zbpg->lock);
+	}
+}
+
+static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid,
+					uint32_t index, struct page *page,
+					void *cdata, unsigned size)
+{
+	struct zbud_hdr *zh0, *zh1, *zh = NULL;
+	struct zbud_page *zbpg = NULL, *ztmp;
+	unsigned nchunks;
+	char *to;
+	int i, found_good_buddy = 0;
+
+	nchunks = zbud_size_to_chunks(size) ;
+	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
+		spin_lock(&zbud_budlists_spinlock);
+		if (!list_empty(&zbud_unbuddied[i].list)) {
+			list_for_each_entry_safe(zbpg, ztmp,
+				    &zbud_unbuddied[i].list, bud_list) {
+				if (spin_trylock(&zbpg->lock)) {
+					found_good_buddy = i;
+					goto found_unbuddied;
+				}
+			}
+		}
+		spin_unlock(&zbud_budlists_spinlock);
+	}
+	/* didn't find a good buddy, try allocating a new page */
+	zbpg = zbud_alloc_raw_page();
+	if (unlikely(zbpg == NULL))
+		goto out;
+	/* ok, have a page, now compress the data before taking locks */
+	spin_lock(&zbpg->lock);
+	spin_lock(&zbud_budlists_spinlock);
+	list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
+	zbud_unbuddied[nchunks].count++;
+	zh = &zbpg->buddy[0];
+	goto init_zh;
+
+found_unbuddied:
+	ASSERT_SPINLOCK(&zbpg->lock);
+	zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
+	BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
+	if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
+		ASSERT_SENTINEL(zh0, ZBH);
+		zh = zh1;
+	} else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
+		ASSERT_SENTINEL(zh1, ZBH);
+		zh = zh0;
+	} else
+		BUG();
+	list_del_init(&zbpg->bud_list);
+	zbud_unbuddied[found_good_buddy].count--;
+	list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
+	zcache_zbud_buddied_count++;
+
+init_zh:
+	SET_SENTINEL(zh, ZBH);
+	zh->size = size;
+	zh->index = index;
+	zh->oid = *oid;
+	zh->pool_id = pool_id;
+	/* can wait to copy the data until the list locks are dropped */
+	spin_unlock(&zbud_budlists_spinlock);
+
+	to = zbud_data(zh, size);
+	memcpy(to, cdata, size);
+	spin_unlock(&zbpg->lock);
+	zbud_cumul_chunk_counts[nchunks]++;
+	atomic_inc(&zcache_zbud_curr_zpages);
+	zcache_zbud_cumul_zpages++;
+	zcache_zbud_curr_zbytes += size;
+	zcache_zbud_cumul_zbytes += size;
+out:
+	return zh;
+}
+
+static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
+{
+	struct zbud_page *zbpg;
+	unsigned budnum = zbud_budnum(zh);
+	size_t out_len = PAGE_SIZE;
+	char *to_va, *from_va;
+	unsigned size;
+	int ret = 0;
+
+	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
+	spin_lock(&zbpg->lock);
+	if (list_empty(&zbpg->bud_list)) {
+		/* ignore zombie page... see zbud_evict_pages() */
+		ret = -EINVAL;
+		goto out;
+	}
+	ASSERT_SENTINEL(zh, ZBH);
+	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
+	to_va = kmap_atomic(page, KM_USER0);
+	size = zh->size;
+	from_va = zbud_data(zh, size);
+	ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
+	BUG_ON(ret != LZO_E_OK);
+	BUG_ON(out_len != PAGE_SIZE);
+	kunmap_atomic(to_va, KM_USER0);
+out:
+	spin_unlock(&zbpg->lock);
+	return ret;
+}
+
+/*
+ * The following routines handle shrinking of ephemeral pages by evicting
+ * pages "least valuable" first.
+ */
+
+static unsigned long zcache_evicted_raw_pages;
+static unsigned long zcache_evicted_buddied_pages;
+static unsigned long zcache_evicted_unbuddied_pages;
+
+static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid);
+static void zcache_put_pool(struct tmem_pool *pool);
+
+/*
+ * Flush and free all zbuds in a zbpg, then free the pageframe
+ */
+static void zbud_evict_zbpg(struct zbud_page *zbpg)
+{
+	struct zbud_hdr *zh;
+	int i, j;
+	uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS];
+	struct tmem_oid oid[ZBUD_MAX_BUDS];
+	struct tmem_pool *pool;
+
+	ASSERT_SPINLOCK(&zbpg->lock);
+	BUG_ON(!list_empty(&zbpg->bud_list));
+	for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
+		zh = &zbpg->buddy[i];
+		if (zh->size) {
+			pool_id[j] = zh->pool_id;
+			oid[j] = zh->oid;
+			index[j] = zh->index;
+			j++;
+			zbud_free(zh);
+		}
+	}
+	spin_unlock(&zbpg->lock);
+	for (i = 0; i < j; i++) {
+		pool = zcache_get_pool_by_id(pool_id[i]);
+		if (pool != NULL) {
+			tmem_flush_page(pool, &oid[i], index[i]);
+			zcache_put_pool(pool);
+		}
+	}
+	ASSERT_SENTINEL(zbpg, ZBPG);
+	spin_lock(&zbpg->lock);
+	zbud_free_raw_page(zbpg);
+}
+
+/*
+ * Free nr pages.  This code is funky because we want to hold the locks
+ * protecting various lists for as short a time as possible, and in some
+ * circumstances the list may change asynchronously when the list lock is
+ * not held.  In some cases we also trylock not only to avoid waiting on a
+ * page in use by another cpu, but also to avoid potential deadlock due to
+ * lock inversion.
+ */
+static void zbud_evict_pages(int nr)
+{
+	struct zbud_page *zbpg;
+	int i;
+
+	/* first try freeing any pages on unused list */
+retry_unused_list:
+	spin_lock_bh(&zbpg_unused_list_spinlock);
+	if (!list_empty(&zbpg_unused_list)) {
+		/* can't walk list here, since it may change when unlocked */
+		zbpg = list_first_entry(&zbpg_unused_list,
+				struct zbud_page, bud_list);
+		list_del_init(&zbpg->bud_list);
+		zcache_zbpg_unused_list_count--;
+		atomic_dec(&zcache_zbud_curr_raw_pages);
+		spin_unlock_bh(&zbpg_unused_list_spinlock);
+		zcache_free_page(zbpg);
+		zcache_evicted_raw_pages++;
+		if (--nr <= 0)
+			goto out;
+		goto retry_unused_list;
+	}
+	spin_unlock_bh(&zbpg_unused_list_spinlock);
+
+	/* now try freeing unbuddied pages, starting with least space avail */
+	for (i = 0; i < MAX_CHUNK; i++) {
+retry_unbud_list_i:
+		spin_lock_bh(&zbud_budlists_spinlock);
+		if (list_empty(&zbud_unbuddied[i].list)) {
+			spin_unlock_bh(&zbud_budlists_spinlock);
+			continue;
+		}
+		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
+			if (unlikely(!spin_trylock(&zbpg->lock)))
+				continue;
+			list_del_init(&zbpg->bud_list);
+			zbud_unbuddied[i].count--;
+			spin_unlock(&zbud_budlists_spinlock);
+			zcache_evicted_unbuddied_pages++;
+			/* want budlists unlocked when doing zbpg eviction */
+			zbud_evict_zbpg(zbpg);
+			local_bh_enable();
+			if (--nr <= 0)
+				goto out;
+			goto retry_unbud_list_i;
+		}
+		spin_unlock_bh(&zbud_budlists_spinlock);
+	}
+
+	/* as a last resort, free buddied pages */
+retry_bud_list:
+	spin_lock_bh(&zbud_budlists_spinlock);
+	if (list_empty(&zbud_buddied_list)) {
+		spin_unlock_bh(&zbud_budlists_spinlock);
+		goto out;
+	}
+	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
+		if (unlikely(!spin_trylock(&zbpg->lock)))
+			continue;
+		list_del_init(&zbpg->bud_list);
+		zcache_zbud_buddied_count--;
+		spin_unlock(&zbud_budlists_spinlock);
+		zcache_evicted_buddied_pages++;
+		/* want budlists unlocked when doing zbpg eviction */
+		zbud_evict_zbpg(zbpg);
+		local_bh_enable();
+		if (--nr <= 0)
+			goto out;
+		goto retry_bud_list;
+	}
+	spin_unlock_bh(&zbud_budlists_spinlock);
+out:
+	return;
+}
+
+static void zbud_init(void)
+{
+	int i;
+
+	INIT_LIST_HEAD(&zbud_buddied_list);
+	zcache_zbud_buddied_count = 0;
+	for (i = 0; i < NCHUNKS; i++) {
+		INIT_LIST_HEAD(&zbud_unbuddied[i].list);
+		zbud_unbuddied[i].count = 0;
+	}
+}
+
+#ifdef CONFIG_SYSFS
+/*
+ * These sysfs routines show a nice distribution of how many zbpg's are
+ * currently (and have ever been placed) in each unbuddied list.  It's fun
+ * to watch but can probably go away before final merge.
+ */
+static int zbud_show_unbuddied_list_counts(char *buf)
+{
+	int i;
+	char *p = buf;
+
+	for (i = 0; i < NCHUNKS - 1; i++)
+		p += sprintf(p, "%u ", zbud_unbuddied[i].count);
+	p += sprintf(p, "%d\n", zbud_unbuddied[i].count);
+	return p - buf;
+}
+
+static int zbud_show_cumul_chunk_counts(char *buf)
+{
+	unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
+	unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
+	unsigned long total_chunks_lte_42 = 0;
+	char *p = buf;
+
+	for (i = 0; i < NCHUNKS; i++) {
+		p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
+		chunks += zbud_cumul_chunk_counts[i];
+		total_chunks += zbud_cumul_chunk_counts[i];
+		sum_total_chunks += i * zbud_cumul_chunk_counts[i];
+		if (i == 21)
+			total_chunks_lte_21 = total_chunks;
+		if (i == 32)
+			total_chunks_lte_32 = total_chunks;
+		if (i == 42)
+			total_chunks_lte_42 = total_chunks;
+	}
+	p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
+		total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
+		chunks == 0 ? 0 : sum_total_chunks / chunks);
+	return p - buf;
+}
+#endif
+
+/**********
+ * This "zv" PAM implementation combines the TLSF-based xvMalloc
+ * with lzo1x compression to maximize the amount of data that can
+ * be packed into a physical page.
+ *
+ * Zv represents a PAM page with the index and object (plus a "size" value
+ * necessary for decompression) immediately preceding the compressed data.
+ */
+
+#define ZVH_SENTINEL  0x43214321
+
+struct zv_hdr {
+	uint32_t pool_id;
+	struct tmem_oid oid;
+	uint32_t index;
+	DECL_SENTINEL
+};
+
+static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
+
+static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,
+				struct tmem_oid *oid, uint32_t index,
+				void *cdata, unsigned clen)
+{
+	struct page *page;
+	struct zv_hdr *zv = NULL;
+	uint32_t offset;
+	int ret;
+
+	BUG_ON(!irqs_disabled());
+	ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr),
+			&page, &offset, ZCACHE_GFP_MASK);
+	if (unlikely(ret))
+		goto out;
+	zv = kmap_atomic(page, KM_USER0) + offset;
+	zv->index = index;
+	zv->oid = *oid;
+	zv->pool_id = pool_id;
+	SET_SENTINEL(zv, ZVH);
+	memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
+	kunmap_atomic(zv, KM_USER0);
+out:
+	return zv;
+}
+
+static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
+{
+	unsigned long flags;
+	struct page *page;
+	uint32_t offset;
+	uint16_t size;
+
+	ASSERT_SENTINEL(zv, ZVH);
+	size = xv_get_object_size(zv) - sizeof(*zv);
+	BUG_ON(size == 0 || size > zv_max_page_size);
+	INVERT_SENTINEL(zv, ZVH);
+	page = virt_to_page(zv);
+	offset = (unsigned long)zv & ~PAGE_MASK;
+	local_irq_save(flags);
+	xv_free(xvpool, page, offset);
+	local_irq_restore(flags);
+}
+
+static void zv_decompress(struct page *page, struct zv_hdr *zv)
+{
+	size_t clen = PAGE_SIZE;
+	char *to_va;
+	unsigned size;
+	int ret;
+
+	ASSERT_SENTINEL(zv, ZVH);
+	size = xv_get_object_size(zv) - sizeof(*zv);
+	BUG_ON(size == 0 || size > zv_max_page_size);
+	to_va = kmap_atomic(page, KM_USER0);
+	ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
+					size, to_va, &clen);
+	kunmap_atomic(to_va, KM_USER0);
+	BUG_ON(ret != LZO_E_OK);
+	BUG_ON(clen != PAGE_SIZE);
+}
+
+/*
+ * zcache core code starts here
+ */
+
+/* useful stats not collected by cleancache or frontswap */
+static unsigned long zcache_flush_total;
+static unsigned long zcache_flush_found;
+static unsigned long zcache_flobj_total;
+static unsigned long zcache_flobj_found;
+static unsigned long zcache_failed_eph_puts;
+static unsigned long zcache_failed_pers_puts;
+
+#define MAX_POOLS_PER_CLIENT 16
+
+static struct {
+	struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
+	struct xv_pool *xvpool;
+} zcache_client;
+
+/*
+ * Tmem operations assume the poolid implies the invoking client.
+ * Zcache only has one client (the kernel itself), so translate
+ * the poolid into the tmem_pool allocated for it.  A KVM version
+ * of zcache would have one client per guest and each client might
+ * have a poolid==N.
+ */
+static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid)
+{
+	struct tmem_pool *pool = NULL;
+
+	if (poolid >= 0) {
+		pool = zcache_client.tmem_pools[poolid];
+		if (pool != NULL)
+			atomic_inc(&pool->refcount);
+	}
+	return pool;
+}
+
+static void zcache_put_pool(struct tmem_pool *pool)
+{
+	if (pool != NULL)
+		atomic_dec(&pool->refcount);
+}
+
+/* counters for debugging */
+static unsigned long zcache_failed_get_free_pages;
+static unsigned long zcache_failed_alloc;
+static unsigned long zcache_put_to_flush;
+static unsigned long zcache_aborted_preload;
+static unsigned long zcache_aborted_shrink;
+
+/*
+ * Ensure that memory allocation requests in zcache don't result
+ * in direct reclaim requests via the shrinker, which would cause
+ * an infinite loop.  Maybe a GFP flag would be better?
+ */
+static DEFINE_SPINLOCK(zcache_direct_reclaim_lock);
+
+/*
+ * for now, used named slabs so can easily track usage; later can
+ * either just use kmalloc, or perhaps add a slab-like allocator
+ * to more carefully manage total memory utilization
+ */
+static struct kmem_cache *zcache_objnode_cache;
+static struct kmem_cache *zcache_obj_cache;
+static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
+static unsigned long zcache_curr_obj_count_max;
+static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
+static unsigned long zcache_curr_objnode_count_max;
+
+/*
+ * to avoid memory allocation recursion (e.g. due to direct reclaim), we
+ * preload all necessary data structures so the hostops callbacks never
+ * actually do a malloc
+ */
+struct zcache_preload {
+	void *page;
+	struct tmem_obj *obj;
+	int nr;
+	struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
+};
+static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
+
+static int zcache_do_preload(struct tmem_pool *pool)
+{
+	struct zcache_preload *kp;
+	struct tmem_objnode *objnode;
+	struct tmem_obj *obj;
+	void *page;
+	int ret = -ENOMEM;
+
+	if (unlikely(zcache_objnode_cache == NULL))
+		goto out;
+	if (unlikely(zcache_obj_cache == NULL))
+		goto out;
+	if (!spin_trylock(&zcache_direct_reclaim_lock)) {
+		zcache_aborted_preload++;
+		goto out;
+	}
+	preempt_disable();
+	kp = &__get_cpu_var(zcache_preloads);
+	while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
+		preempt_enable_no_resched();
+		objnode = kmem_cache_alloc(zcache_objnode_cache,
+				ZCACHE_GFP_MASK);
+		if (unlikely(objnode == NULL)) {
+			zcache_failed_alloc++;
+			goto unlock_out;
+		}
+		preempt_disable();
+		kp = &__get_cpu_var(zcache_preloads);
+		if (kp->nr < ARRAY_SIZE(kp->objnodes))
+			kp->objnodes[kp->nr++] = objnode;
+		else
+			kmem_cache_free(zcache_objnode_cache, objnode);
+	}
+	preempt_enable_no_resched();
+	obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
+	if (unlikely(obj == NULL)) {
+		zcache_failed_alloc++;
+		goto unlock_out;
+	}
+	page = (void *)__get_free_page(ZCACHE_GFP_MASK);
+	if (unlikely(page == NULL)) {
+		zcache_failed_get_free_pages++;
+		goto unlock_out;
+	}
+	preempt_disable();
+	kp = &__get_cpu_var(zcache_preloads);
+	if (kp->obj == NULL)
+		kp->obj = obj;
+	else
+		kmem_cache_free(zcache_obj_cache, obj);
+	if (kp->page == NULL)
+		kp->page = page;
+	else
+		free_page((unsigned long)page);
+	ret = 0;
+unlock_out:
+	spin_unlock(&zcache_direct_reclaim_lock);
+out:
+	return ret;
+}
+
+static void *zcache_get_free_page(void)
+{
+	struct zcache_preload *kp;
+	void *page;
+
+	kp = &__get_cpu_var(zcache_preloads);
+	page = kp->page;
+	BUG_ON(page == NULL);
+	kp->page = NULL;
+	return page;
+}
+
+static void zcache_free_page(void *p)
+{
+	free_page((unsigned long)p);
+}
+
+/*
+ * zcache implementation for tmem host ops
+ */
+
+static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
+{
+	struct tmem_objnode *objnode = NULL;
+	unsigned long count;
+	struct zcache_preload *kp;
+
+	kp = &__get_cpu_var(zcache_preloads);
+	if (kp->nr <= 0)
+		goto out;
+	objnode = kp->objnodes[kp->nr - 1];
+	BUG_ON(objnode == NULL);
+	kp->objnodes[kp->nr - 1] = NULL;
+	kp->nr--;
+	count = atomic_inc_return(&zcache_curr_objnode_count);
+	if (count > zcache_curr_objnode_count_max)
+		zcache_curr_objnode_count_max = count;
+out:
+	return objnode;
+}
+
+static void zcache_objnode_free(struct tmem_objnode *objnode,
+					struct tmem_pool *pool)
+{
+	atomic_dec(&zcache_curr_objnode_count);
+	BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
+	kmem_cache_free(zcache_objnode_cache, objnode);
+}
+
+static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
+{
+	struct tmem_obj *obj = NULL;
+	unsigned long count;
+	struct zcache_preload *kp;
+
+	kp = &__get_cpu_var(zcache_preloads);
+	obj = kp->obj;
+	BUG_ON(obj == NULL);
+	kp->obj = NULL;
+	count = atomic_inc_return(&zcache_curr_obj_count);
+	if (count > zcache_curr_obj_count_max)
+		zcache_curr_obj_count_max = count;
+	return obj;
+}
+
+static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
+{
+	atomic_dec(&zcache_curr_obj_count);
+	BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
+	kmem_cache_free(zcache_obj_cache, obj);
+}
+
+static struct tmem_hostops zcache_hostops = {
+	.obj_alloc = zcache_obj_alloc,
+	.obj_free = zcache_obj_free,
+	.objnode_alloc = zcache_objnode_alloc,
+	.objnode_free = zcache_objnode_free,
+};
+
+/*
+ * zcache implementations for PAM page descriptor ops
+ */
+
+static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
+static unsigned long zcache_curr_eph_pampd_count_max;
+static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
+static unsigned long zcache_curr_pers_pampd_count_max;
+
+/* forward reference */
+static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
+
+static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
+				 uint32_t index, struct page *page)
+{
+	void *pampd = NULL, *cdata;
+	size_t clen;
+	int ret;
+	bool ephemeral = is_ephemeral(pool);
+	unsigned long count;
+
+	if (ephemeral) {
+		ret = zcache_compress(page, &cdata, &clen);
+		if (ret == 0)
+
+			goto out;
+		if (clen == 0 || clen > zbud_max_buddy_size()) {
+			zcache_compress_poor++;
+			goto out;
+		}
+		pampd = (void *)zbud_create(pool->pool_id, oid, index,
+						page, cdata, clen);
+		if (pampd != NULL) {
+			count = atomic_inc_return(&zcache_curr_eph_pampd_count);
+			if (count > zcache_curr_eph_pampd_count_max)
+				zcache_curr_eph_pampd_count_max = count;
+		}
+	} else {
+		/*
+		 * FIXME: This is all the "policy" there is for now.
+		 * 3/4 totpages should allow ~37% of RAM to be filled with
+		 * compressed frontswap pages
+		 */
+		if (atomic_read(&zcache_curr_pers_pampd_count) >
+							3 * totalram_pages / 4)
+			goto out;
+		ret = zcache_compress(page, &cdata, &clen);
+		if (ret == 0)
+			goto out;
+		if (clen > zv_max_page_size) {
+			zcache_compress_poor++;
+			goto out;
+		}
+		pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id,
+						oid, index, cdata, clen);
+		if (pampd == NULL)
+			goto out;
+		count = atomic_inc_return(&zcache_curr_pers_pampd_count);
+		if (count > zcache_curr_pers_pampd_count_max)
+			zcache_curr_pers_pampd_count_max = count;
+	}
+out:
+	return pampd;
+}
+
+/*
+ * fill the pageframe corresponding to the struct page with the data
+ * from the passed pampd
+ */
+static int zcache_pampd_get_data(struct page *page, void *pampd,
+						struct tmem_pool *pool)
+{
+	int ret = 0;
+
+	if (is_ephemeral(pool))
+		ret = zbud_decompress(page, pampd);
+	else
+		zv_decompress(page, pampd);
+	return ret;
+}
+
+/*
+ * free the pampd and remove it from any zcache lists
+ * pampd must no longer be pointed to from any tmem data structures!
+ */
+static void zcache_pampd_free(void *pampd, struct tmem_pool *pool)
+{
+	if (is_ephemeral(pool)) {
+		zbud_free_and_delist((struct zbud_hdr *)pampd);
+		atomic_dec(&zcache_curr_eph_pampd_count);
+		BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
+	} else {
+		zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd);
+		atomic_dec(&zcache_curr_pers_pampd_count);
+		BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
+	}
+}
+
+static struct tmem_pamops zcache_pamops = {
+	.create = zcache_pampd_create,
+	.get_data = zcache_pampd_get_data,
+	.free = zcache_pampd_free,
+};
+
+/*
+ * zcache compression/decompression and related per-cpu stuff
+ */
+
+#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
+#define LZO_DSTMEM_PAGE_ORDER 1
+static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
+static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
+
+static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
+{
+	int ret = 0;
+	unsigned char *dmem = __get_cpu_var(zcache_dstmem);
+	unsigned char *wmem = __get_cpu_var(zcache_workmem);
+	char *from_va;
+
+	BUG_ON(!irqs_disabled());
+	if (unlikely(dmem == NULL || wmem == NULL))
+		goto out;  /* no buffer, so can't compress */
+	from_va = kmap_atomic(from, KM_USER0);
+	mb();
+	ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
+	BUG_ON(ret != LZO_E_OK);
+	*out_va = dmem;
+	kunmap_atomic(from_va, KM_USER0);
+	ret = 1;
+out:
+	return ret;
+}
+
+
+static int zcache_cpu_notifier(struct notifier_block *nb,
+				unsigned long action, void *pcpu)
+{
+	int cpu = (long)pcpu;
+	struct zcache_preload *kp;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
+			GFP_KERNEL | __GFP_REPEAT,
+			LZO_DSTMEM_PAGE_ORDER),
+		per_cpu(zcache_workmem, cpu) =
+			kzalloc(LZO1X_MEM_COMPRESS,
+				GFP_KERNEL | __GFP_REPEAT);
+		break;
+	case CPU_DEAD:
+	case CPU_UP_CANCELED:
+		free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
+				LZO_DSTMEM_PAGE_ORDER);
+		per_cpu(zcache_dstmem, cpu) = NULL;
+		kfree(per_cpu(zcache_workmem, cpu));
+		per_cpu(zcache_workmem, cpu) = NULL;
+		kp = &per_cpu(zcache_preloads, cpu);
+		while (kp->nr) {
+			kmem_cache_free(zcache_objnode_cache,
+					kp->objnodes[kp->nr - 1]);
+			kp->objnodes[kp->nr - 1] = NULL;
+			kp->nr--;
+		}
+		kmem_cache_free(zcache_obj_cache, kp->obj);
+		free_page((unsigned long)kp->page);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block zcache_cpu_notifier_block = {
+	.notifier_call = zcache_cpu_notifier
+};
+
+#ifdef CONFIG_SYSFS
+#define ZCACHE_SYSFS_RO(_name) \
+	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
+				struct kobj_attribute *attr, char *buf) \
+	{ \
+		return sprintf(buf, "%lu\n", zcache_##_name); \
+	} \
+	static struct kobj_attribute zcache_##_name##_attr = { \
+		.attr = { .name = __stringify(_name), .mode = 0444 }, \
+		.show = zcache_##_name##_show, \
+	}
+
+#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
+	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
+				struct kobj_attribute *attr, char *buf) \
+	{ \
+	    return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
+	} \
+	static struct kobj_attribute zcache_##_name##_attr = { \
+		.attr = { .name = __stringify(_name), .mode = 0444 }, \
+		.show = zcache_##_name##_show, \
+	}
+
+#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
+	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
+				struct kobj_attribute *attr, char *buf) \
+	{ \
+	    return _func(buf); \
+	} \
+	static struct kobj_attribute zcache_##_name##_attr = { \
+		.attr = { .name = __stringify(_name), .mode = 0444 }, \
+		.show = zcache_##_name##_show, \
+	}
+
+ZCACHE_SYSFS_RO(curr_obj_count_max);
+ZCACHE_SYSFS_RO(curr_objnode_count_max);
+ZCACHE_SYSFS_RO(flush_total);
+ZCACHE_SYSFS_RO(flush_found);
+ZCACHE_SYSFS_RO(flobj_total);
+ZCACHE_SYSFS_RO(flobj_found);
+ZCACHE_SYSFS_RO(failed_eph_puts);
+ZCACHE_SYSFS_RO(failed_pers_puts);
+ZCACHE_SYSFS_RO(zbud_curr_zbytes);
+ZCACHE_SYSFS_RO(zbud_cumul_zpages);
+ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
+ZCACHE_SYSFS_RO(zbud_buddied_count);
+ZCACHE_SYSFS_RO(zbpg_unused_list_count);
+ZCACHE_SYSFS_RO(evicted_raw_pages);
+ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
+ZCACHE_SYSFS_RO(evicted_buddied_pages);
+ZCACHE_SYSFS_RO(failed_get_free_pages);
+ZCACHE_SYSFS_RO(failed_alloc);
+ZCACHE_SYSFS_RO(put_to_flush);
+ZCACHE_SYSFS_RO(aborted_preload);
+ZCACHE_SYSFS_RO(aborted_shrink);
+ZCACHE_SYSFS_RO(compress_poor);
+ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
+ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
+ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
+ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
+ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
+			zbud_show_unbuddied_list_counts);
+ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
+			zbud_show_cumul_chunk_counts);
+
+static struct attribute *zcache_attrs[] = {
+	&zcache_curr_obj_count_attr.attr,
+	&zcache_curr_obj_count_max_attr.attr,
+	&zcache_curr_objnode_count_attr.attr,
+	&zcache_curr_objnode_count_max_attr.attr,
+	&zcache_flush_total_attr.attr,
+	&zcache_flobj_total_attr.attr,
+	&zcache_flush_found_attr.attr,
+	&zcache_flobj_found_attr.attr,
+	&zcache_failed_eph_puts_attr.attr,
+	&zcache_failed_pers_puts_attr.attr,
+	&zcache_compress_poor_attr.attr,
+	&zcache_zbud_curr_raw_pages_attr.attr,
+	&zcache_zbud_curr_zpages_attr.attr,
+	&zcache_zbud_curr_zbytes_attr.attr,
+	&zcache_zbud_cumul_zpages_attr.attr,
+	&zcache_zbud_cumul_zbytes_attr.attr,
+	&zcache_zbud_buddied_count_attr.attr,
+	&zcache_zbpg_unused_list_count_attr.attr,
+	&zcache_evicted_raw_pages_attr.attr,
+	&zcache_evicted_unbuddied_pages_attr.attr,
+	&zcache_evicted_buddied_pages_attr.attr,
+	&zcache_failed_get_free_pages_attr.attr,
+	&zcache_failed_alloc_attr.attr,
+	&zcache_put_to_flush_attr.attr,
+	&zcache_aborted_preload_attr.attr,
+	&zcache_aborted_shrink_attr.attr,
+	&zcache_zbud_unbuddied_list_counts_attr.attr,
+	&zcache_zbud_cumul_chunk_counts_attr.attr,
+	NULL,
+};
+
+static struct attribute_group zcache_attr_group = {
+	.attrs = zcache_attrs,
+	.name = "zcache",
+};
+
+#endif /* CONFIG_SYSFS */
+/*
+ * When zcache is disabled ("frozen"), pools can be created and destroyed,
+ * but all puts (and thus all other operations that require memory allocation)
+ * must fail.  If zcache is unfrozen, accepts puts, then frozen again,
+ * data consistency requires all puts while frozen to be converted into
+ * flushes.
+ */
+static bool zcache_freeze;
+
+/*
+ * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
+ */
+static int shrink_zcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+{
+	int ret = -1;
+
+	if (nr >= 0) {
+		if (!(gfp_mask & __GFP_FS))
+			/* does this case really need to be skipped? */
+			goto out;
+		if (spin_trylock(&zcache_direct_reclaim_lock)) {
+			zbud_evict_pages(nr);
+			spin_unlock(&zcache_direct_reclaim_lock);
+		} else
+			zcache_aborted_shrink++;
+	}
+	ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
+out:
+	return ret;
+}
+
+static struct shrinker zcache_shrinker = {
+	.shrink = shrink_zcache_memory,
+	.seeks = DEFAULT_SEEKS,
+};
+
+/*
+ * zcache shims between cleancache/frontswap ops and tmem
+ */
+
+static int zcache_put_page(int pool_id, struct tmem_oid *oidp,
+				uint32_t index, struct page *page)
+{
+	struct tmem_pool *pool;
+	int ret = -1;
+
+	BUG_ON(!irqs_disabled());
+	pool = zcache_get_pool_by_id(pool_id);
+	if (unlikely(pool == NULL))
+		goto out;
+	if (!zcache_freeze && zcache_do_preload(pool) == 0) {
+		/* preload does preempt_disable on success */
+		ret = tmem_put(pool, oidp, index, page);
+		if (ret < 0) {
+			if (is_ephemeral(pool))
+				zcache_failed_eph_puts++;
+			else
+				zcache_failed_pers_puts++;
+		}
+		zcache_put_pool(pool);
+		preempt_enable_no_resched();
+	} else {
+		zcache_put_to_flush++;
+		if (atomic_read(&pool->obj_count) > 0)
+			/* the put fails whether the flush succeeds or not */
+			(void)tmem_flush_page(pool, oidp, index);
+		zcache_put_pool(pool);
+	}
+out:
+	return ret;
+}
+
+static int zcache_get_page(int pool_id, struct tmem_oid *oidp,
+				uint32_t index, struct page *page)
+{
+	struct tmem_pool *pool;
+	int ret = -1;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pool = zcache_get_pool_by_id(pool_id);
+	if (likely(pool != NULL)) {
+		if (atomic_read(&pool->obj_count) > 0)
+			ret = tmem_get(pool, oidp, index, page);
+		zcache_put_pool(pool);
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index)
+{
+	struct tmem_pool *pool;
+	int ret = -1;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	zcache_flush_total++;
+	pool = zcache_get_pool_by_id(pool_id);
+	if (likely(pool != NULL)) {
+		if (atomic_read(&pool->obj_count) > 0)
+			ret = tmem_flush_page(pool, oidp, index);
+		zcache_put_pool(pool);
+	}
+	if (ret >= 0)
+		zcache_flush_found++;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int zcache_flush_object(int pool_id, struct tmem_oid *oidp)
+{
+	struct tmem_pool *pool;
+	int ret = -1;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	zcache_flobj_total++;
+	pool = zcache_get_pool_by_id(pool_id);
+	if (likely(pool != NULL)) {
+		if (atomic_read(&pool->obj_count) > 0)
+			ret = tmem_flush_object(pool, oidp);
+		zcache_put_pool(pool);
+	}
+	if (ret >= 0)
+		zcache_flobj_found++;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int zcache_destroy_pool(int pool_id)
+{
+	struct tmem_pool *pool = NULL;
+	int ret = -1;
+
+	if (pool_id < 0)
+		goto out;
+	pool = zcache_client.tmem_pools[pool_id];
+	if (pool == NULL)
+		goto out;
+	zcache_client.tmem_pools[pool_id] = NULL;
+	/* wait for pool activity on other cpus to quiesce */
+	while (atomic_read(&pool->refcount) != 0)
+		;
+	local_bh_disable();
+	ret = tmem_destroy_pool(pool);
+	local_bh_enable();
+	kfree(pool);
+	pr_info("zcache: destroyed pool id=%d\n", pool_id);
+out:
+	return ret;
+}
+
+static int zcache_new_pool(uint32_t flags)
+{
+	int poolid = -1;
+	struct tmem_pool *pool;
+
+	pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
+	if (pool == NULL) {
+		pr_info("zcache: pool creation failed: out of memory\n");
+		goto out;
+	}
+
+	for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
+		if (zcache_client.tmem_pools[poolid] == NULL)
+			break;
+	if (poolid >= MAX_POOLS_PER_CLIENT) {
+		pr_info("zcache: pool creation failed: max exceeded\n");
+		kfree(pool);
+		poolid = -1;
+		goto out;
+	}
+	atomic_set(&pool->refcount, 0);
+	pool->client = &zcache_client;
+	pool->pool_id = poolid;
+	tmem_new_pool(pool, flags);
+	zcache_client.tmem_pools[poolid] = pool;
+	pr_info("zcache: created %s tmem pool, id=%d\n",
+		flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
+		poolid);
+out:
+	return poolid;
+}
+
+/**********
+ * Two kernel functionalities currently can be layered on top of tmem.
+ * These are "cleancache" which is used as a second-chance cache for clean
+ * page cache pages; and "frontswap" which is used for swap pages
+ * to avoid writes to disk.  A generic "shim" is provided here for each
+ * to translate in-kernel semantics to zcache semantics.
+ */
+
+#ifdef CONFIG_CLEANCACHE
+static void zcache_cleancache_put_page(int pool_id,
+					struct cleancache_filekey key,
+					pgoff_t index, struct page *page)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+	if (likely(ind == index))
+		(void)zcache_put_page(pool_id, &oid, index, page);
+}
+
+static int zcache_cleancache_get_page(int pool_id,
+					struct cleancache_filekey key,
+					pgoff_t index, struct page *page)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+	int ret = -1;
+
+	if (likely(ind == index))
+		ret = zcache_get_page(pool_id, &oid, index, page);
+	return ret;
+}
+
+static void zcache_cleancache_flush_page(int pool_id,
+					struct cleancache_filekey key,
+					pgoff_t index)
+{
+	u32 ind = (u32) index;
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+	if (likely(ind == index))
+		(void)zcache_flush_page(pool_id, &oid, ind);
+}
+
+static void zcache_cleancache_flush_inode(int pool_id,
+					struct cleancache_filekey key)
+{
+	struct tmem_oid oid = *(struct tmem_oid *)&key;
+
+	(void)zcache_flush_object(pool_id, &oid);
+}
+
+static void zcache_cleancache_flush_fs(int pool_id)
+{
+	if (pool_id >= 0)
+		(void)zcache_destroy_pool(pool_id);
+}
+
+static int zcache_cleancache_init_fs(size_t pagesize)
+{
+	BUG_ON(sizeof(struct cleancache_filekey) !=
+				sizeof(struct tmem_oid));
+	BUG_ON(pagesize != PAGE_SIZE);
+	return zcache_new_pool(0);
+}
+
+static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
+{
+	/* shared pools are unsupported and map to private */
+	BUG_ON(sizeof(struct cleancache_filekey) !=
+				sizeof(struct tmem_oid));
+	BUG_ON(pagesize != PAGE_SIZE);
+	return zcache_new_pool(0);
+}
+
+static struct cleancache_ops zcache_cleancache_ops = {
+	.put_page = zcache_cleancache_put_page,
+	.get_page = zcache_cleancache_get_page,
+	.flush_page = zcache_cleancache_flush_page,
+	.flush_inode = zcache_cleancache_flush_inode,
+	.flush_fs = zcache_cleancache_flush_fs,
+	.init_shared_fs = zcache_cleancache_init_shared_fs,
+	.init_fs = zcache_cleancache_init_fs
+};
+
+struct cleancache_ops zcache_cleancache_register_ops(void)
+{
+	struct cleancache_ops old_ops =
+		cleancache_register_ops(&zcache_cleancache_ops);
+
+	return old_ops;
+}
+#endif
+
+#ifdef CONFIG_FRONTSWAP
+/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
+static int zcache_frontswap_poolid = -1;
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
+ */
+#define SWIZ_BITS		4
+#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
+#define _oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
+#define iswiz(_ind)		(_ind >> SWIZ_BITS)
+
+static inline struct tmem_oid oswiz(unsigned type, u32 ind)
+{
+	struct tmem_oid oid = { .oid = { 0 } };
+	oid.oid[0] = _oswiz(type, ind);
+	return oid;
+}
+
+static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
+				   struct page *page)
+{
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	struct tmem_oid oid = oswiz(type, ind);
+	int ret = -1;
+	unsigned long flags;
+
+	BUG_ON(!PageLocked(page));
+	if (likely(ind64 == ind)) {
+		local_irq_save(flags);
+		ret = zcache_put_page(zcache_frontswap_poolid, &oid,
+					iswiz(ind), page);
+		local_irq_restore(flags);
+	}
+	return ret;
+}
+
+/* returns 0 if the page was successfully gotten from frontswap, -1 if
+ * was not present (should never happen!) */
+static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
+				   struct page *page)
+{
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	struct tmem_oid oid = oswiz(type, ind);
+	int ret = -1;
+
+	BUG_ON(!PageLocked(page));
+	if (likely(ind64 == ind))
+		ret = zcache_get_page(zcache_frontswap_poolid, &oid,
+					iswiz(ind), page);
+	return ret;
+}
+
+/* flush a single page from frontswap */
+static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
+{
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	struct tmem_oid oid = oswiz(type, ind);
+
+	if (likely(ind64 == ind))
+		(void)zcache_flush_page(zcache_frontswap_poolid, &oid,
+					iswiz(ind));
+}
+
+/* flush all pages from the passed swaptype */
+static void zcache_frontswap_flush_area(unsigned type)
+{
+	struct tmem_oid oid;
+	int ind;
+
+	for (ind = SWIZ_MASK; ind >= 0; ind--) {
+		oid = oswiz(type, ind);
+		(void)zcache_flush_object(zcache_frontswap_poolid, &oid);
+	}
+}
+
+static void zcache_frontswap_init(unsigned ignored)
+{
+	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
+	if (zcache_frontswap_poolid < 0)
+		zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST);
+}
+
+static struct frontswap_ops zcache_frontswap_ops = {
+	.put_page = zcache_frontswap_put_page,
+	.get_page = zcache_frontswap_get_page,
+	.flush_page = zcache_frontswap_flush_page,
+	.flush_area = zcache_frontswap_flush_area,
+	.init = zcache_frontswap_init
+};
+
+struct frontswap_ops zcache_frontswap_register_ops(void)
+{
+	struct frontswap_ops old_ops =
+		frontswap_register_ops(&zcache_frontswap_ops);
+
+	return old_ops;
+}
+#endif
+
+/*
+ * zcache initialization
+ * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
+ * NOTHING HAPPENS!
+ */
+
+static int zcache_enabled;
+
+static int __init enable_zcache(char *s)
+{
+	zcache_enabled = 1;
+	return 1;
+}
+__setup("zcache", enable_zcache);
+
+/* allow independent dynamic disabling of cleancache and frontswap */
+
+static int use_cleancache = 1;
+
+static int __init no_cleancache(char *s)
+{
+	use_cleancache = 0;
+	return 1;
+}
+
+__setup("nocleancache", no_cleancache);
+
+static int use_frontswap = 1;
+
+static int __init no_frontswap(char *s)
+{
+	use_frontswap = 0;
+	return 1;
+}
+
+__setup("nofrontswap", no_frontswap);
+
+static int __init zcache_init(void)
+{
+#ifdef CONFIG_SYSFS
+	int ret = 0;
+
+	ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
+	if (ret) {
+		pr_err("zcache: can't create sysfs\n");
+		goto out;
+	}
+#endif /* CONFIG_SYSFS */
+#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
+	if (zcache_enabled) {
+		unsigned int cpu;
+
+		tmem_register_hostops(&zcache_hostops);
+		tmem_register_pamops(&zcache_pamops);
+		ret = register_cpu_notifier(&zcache_cpu_notifier_block);
+		if (ret) {
+			pr_err("zcache: can't register cpu notifier\n");
+			goto out;
+		}
+		for_each_online_cpu(cpu) {
+			void *pcpu = (void *)(long)cpu;
+			zcache_cpu_notifier(&zcache_cpu_notifier_block,
+				CPU_UP_PREPARE, pcpu);
+		}
+	}
+	zcache_objnode_cache = kmem_cache_create("zcache_objnode",
+				sizeof(struct tmem_objnode), 0, 0, NULL);
+	zcache_obj_cache = kmem_cache_create("zcache_obj",
+				sizeof(struct tmem_obj), 0, 0, NULL);
+#endif
+#ifdef CONFIG_CLEANCACHE
+	if (zcache_enabled && use_cleancache) {
+		struct cleancache_ops old_ops;
+
+		zbud_init();
+		register_shrinker(&zcache_shrinker);
+		old_ops = zcache_cleancache_register_ops();
+		pr_info("zcache: cleancache enabled using kernel "
+			"transcendent memory and compression buddies\n");
+		if (old_ops.init_fs != NULL)
+			pr_warning("zcache: cleancache_ops overridden");
+	}
+#endif
+#ifdef CONFIG_FRONTSWAP
+	if (zcache_enabled && use_frontswap) {
+		struct frontswap_ops old_ops;
+
+		zcache_client.xvpool = xv_create_pool();
+		if (zcache_client.xvpool == NULL) {
+			pr_err("zcache: can't create xvpool\n");
+			goto out;
+		}
+		old_ops = zcache_frontswap_register_ops();
+		pr_info("zcache: frontswap enabled using kernel "
+			"transcendent memory and xvmalloc\n");
+		if (old_ops.init != NULL)
+			pr_warning("ktmem: frontswap_ops overridden");
+	}
+#endif
+out:
+	return ret;
+}
+
+module_init(zcache_init)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-07  3:26 [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services Dan Magenheimer
@ 2011-02-08 22:48 ` Minchan Kim
  2011-02-08 23:27   ` Dan Magenheimer
  2011-02-15 16:53 ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 13+ messages in thread
From: Minchan Kim @ 2011-02-08 22:48 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: gregkh, chris.mason, akpm, torvalds, matthew, linux-kernel,
	linux-mm, ngupta, jeremy, kurt.hackel, npiggin, riel,
	konrad.wilk, mel, kosaki.motohiro, sfr, wfg, tytso, viro, hughd,
	hannes

On Mon, Feb 7, 2011 at 12:26 PM, Dan Magenheimer
<dan.magenheimer@oracle.com> wrote:
> [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
>
> Zcache provides host services (memory allocation) for tmem,
> a "shim" to interface cleancache and frontswap to tmem, and
> two different page-addressable memory implemenations using
> lzo1x compression.  The first, "compression buddies" ("zbud")
> compresses pairs of pages and supplies a shrinker interface
> that allows entire pages to be reclaimed.  The second is
> a shim to xvMalloc which is more space-efficient but
> less receptive to page reclamation.  The first is used
> for ephemeral pools and the second for persistent pools.
> All ephemeral pools share the same memory, that is, even
> pages from different pools can share the same page.
>
> Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
> Signed-off-by: Nitin Gupta <ngupta@vflare.org>

Hi Dan,
First of all, thanks for endless effort.

I didn't look at code entirely but it seems this series includes frontswap.
Finally frontswap is to replace zram?

If it is right, how about approaching one by one for easy review and merging?
I mean firstly we replace zram into frontswap and tmem and then zcache.
Regardless of my suggestion, I will look at the this series in my spare time.

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-08 22:48 ` Minchan Kim
@ 2011-02-08 23:27   ` Dan Magenheimer
  2011-02-08 23:56     ` Minchan Kim
  0 siblings, 1 reply; 13+ messages in thread
From: Dan Magenheimer @ 2011-02-08 23:27 UTC (permalink / raw)
  To: Minchan Kim
  Cc: gregkh, Chris Mason, akpm, torvalds, matthew, linux-kernel,
	linux-mm, ngupta, jeremy, Kurt Hackel, npiggin, riel,
	Konrad Wilk, mel, kosaki.motohiro, sfr, wfg, tytso, viro, hughd,
	hannes

Hi Minchan --

> First of all, thanks for endless effort.

Sometimes it does seem endless ;-)
 
> I didn't look at code entirely but it seems this series includes
> frontswap.

The "new zcache" optionally depends on frontswap, but frontswap is
a separate patchset.  If the frontswap patchset is present
and configured, zcache will use it to dynamically compress swap pages.
If frontswap is not present or not configured, zcache will only
use cleancache to dynamically compress clean page cache pages.
For best results, both frontswap and cleancache should be enabled.
(and see the link in PATCH V2 0/3 for a monolithic patch against
2.6.37 that enabled both).

> Finally frontswap is to replace zram?

Nitin and I have agreed that, for now, both frontswap and zram
should continue to exist.  They have similar functionality but
different use models.  Over time we will see if they can be merged.

Nitin and I agreed offlist that the following summarizes the
differences between zram and frontswap:

===========

Zram uses an asynchronous model (e.g. uses the block I/O subsystem)
and requires a device to be explicitly created.  When used for
swap, mkswap creates a fixed-size swap device (usually with higher
priority than any disk-based swap device) and zram is filled
until it is full, at which point other lower-priority (disk-based)
swap devices are then used.  So zram is well-suited for a fixed-
size-RAM machine with a known workload where an administrator
can pre-configure a zram device to improve RAM efficiency during
peak memory load.

Frontswap uses a synchronous model, circumventing the block I/O
subsystem.  The frontswap "device" is completely dynamic in size,
e.g. frontswap is queried for every individual page-to-be-swapped
and, if rejected, the page is swapped to the "real" swap device.
So frontswap is well-suited for highly dynamic conditions where
workload is unpredictable and/or RAM size may "vary" due to
circumstances not entirely within the kernel's control.

==========

Does that make sense?

> Regardless of my suggestion, I will look at the this series in my spare
> time.

Thanks, we are looking forward to your always excellent and
thorough review!

Dan


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-08 23:27   ` Dan Magenheimer
@ 2011-02-08 23:56     ` Minchan Kim
  2011-02-09 16:39       ` Dan Magenheimer
  0 siblings, 1 reply; 13+ messages in thread
From: Minchan Kim @ 2011-02-08 23:56 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: gregkh, Chris Mason, akpm, torvalds, matthew, linux-kernel,
	linux-mm, ngupta, jeremy, Kurt Hackel, npiggin, riel,
	Konrad Wilk, mel, kosaki.motohiro, sfr, wfg, tytso, viro, hughd,
	hannes

On Wed, Feb 9, 2011 at 8:27 AM, Dan Magenheimer
<dan.magenheimer@oracle.com> wrote:
> Hi Minchan --
>
>> First of all, thanks for endless effort.
>
> Sometimes it does seem endless ;-)
>
>> I didn't look at code entirely but it seems this series includes
>> frontswap.
>
> The "new zcache" optionally depends on frontswap, but frontswap is
> a separate patchset.  If the frontswap patchset is present
> and configured, zcache will use it to dynamically compress swap pages.
> If frontswap is not present or not configured, zcache will only
> use cleancache to dynamically compress clean page cache pages.
> For best results, both frontswap and cleancache should be enabled.
> (and see the link in PATCH V2 0/3 for a monolithic patch against
> 2.6.37 that enabled both).
>
>> Finally frontswap is to replace zram?
>
> Nitin and I have agreed that, for now, both frontswap and zram
> should continue to exist.  They have similar functionality but
> different use models.  Over time we will see if they can be merged.
>
> Nitin and I agreed offlist that the following summarizes the
> differences between zram and frontswap:
>
> ===========
>
> Zram uses an asynchronous model (e.g. uses the block I/O subsystem)
> and requires a device to be explicitly created.  When used for
> swap, mkswap creates a fixed-size swap device (usually with higher
> priority than any disk-based swap device) and zram is filled
> until it is full, at which point other lower-priority (disk-based)
> swap devices are then used.  So zram is well-suited for a fixed-
> size-RAM machine with a known workload where an administrator
> can pre-configure a zram device to improve RAM efficiency during
> peak memory load.
>
> Frontswap uses a synchronous model, circumventing the block I/O
> subsystem.  The frontswap "device" is completely dynamic in size,
> e.g. frontswap is queried for every individual page-to-be-swapped
> and, if rejected, the page is swapped to the "real" swap device.
> So frontswap is well-suited for highly dynamic conditions where
> workload is unpredictable and/or RAM size may "vary" due to
> circumstances not entirely within the kernel's control.
>
> ==========
>
> Does that make sense?

Thanks for the quick reply.
As I read your comment, I can't find the benefit of zram compared to frontswap.

1. asynchronous model
2. usability
3. adaptive dynamic ram size

If I consider your statement, with 2, 3, zram isn't better than
fronswap, I think.
1 on zram may be good than frontswap but I doubt how much we have a
big benefit on async operation in ramdisk model.

If we have a big overhead of block stuff in such a model, couldn't we
remove the overhead generally?

What I can think of benefit is that zram export interface to block
device so someone can use compressed block device.
Block device interface exporting is enough to live zram in there?

Maybe I miss something of zram's benefits.
At least, I can't convince why zram and frontswap should coexist.
AFAIK, Nitin and you discussed it many times long time ago but I
didn't follow up it.  Sorry if I am missing something.

Thanks.


-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 13+ messages in thread

* RE: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-08 23:56     ` Minchan Kim
@ 2011-02-09 16:39       ` Dan Magenheimer
  2011-02-09 17:36         ` Nitin Gupta
  2011-02-09 23:57         ` Minchan Kim
  0 siblings, 2 replies; 13+ messages in thread
From: Dan Magenheimer @ 2011-02-09 16:39 UTC (permalink / raw)
  To: Minchan Kim
  Cc: gregkh, Chris Mason, akpm, torvalds, matthew, linux-kernel,
	linux-mm, ngupta, jeremy, Kurt Hackel, npiggin, riel,
	Konrad Wilk, mel, kosaki.motohiro, sfr, wfg, tytso, viro, hughd,
	hannes



> From: Minchan Kim [mailto:minchan.kim@gmail.com]

> As I read your comment, I can't find the benefit of zram compared to
> frontswap.

Well, I am biased, but I agree that frontswap is a better technical
solution than zram. ;-)  But "dynamic-ity" is very important to
me and may be less important to others.

I thought of these other differences, both technical and
non-technical:

- Zram is minimally invasive to the swap subsystem, requiring only
  one hook which is already upstream (though see below) and is
  apparently already used by some Linux users.  Frontswap is somewhat
  more invasive and, UNTIL zcache-was-kztmem was posted a few weeks
  ago, had no non-Xen users (though some distros are already shipping
  the hooks in their kernels because Xen supports it); as a result,
  frontswap has gotten almost no review by kernel swap subsystem
  experts who I'm guessing weren't interested in anything that
  required Xen to use... hopefully that barrier is now resolved
  (but bottom line is frontswap is not yet upstream).

- Zram has one-byte of overhead per page in every explicitly configured
  zram swap, the same as any real swap device.  Frontswap has one-BIT
  of overhead per page for every configured (real) swap device.

- Frontswap requires several hooks scattered through the swap subsystem:
  a) init, put, get, flush, and destroy
  b) a bit-per-page map to record whether a swapped page is in
     frontswap or on the real device
  c) a "partial swapoff" to suck stale pages out of frontswap
  Zram's one flush hook is upstream, though IMHO to be fully functional
  in the real world, it needs some form of (c) also.

Thanks,
Dan

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-09 16:39       ` Dan Magenheimer
@ 2011-02-09 17:36         ` Nitin Gupta
  2011-02-09 23:46           ` Minchan Kim
  2011-02-09 23:57         ` Minchan Kim
  1 sibling, 1 reply; 13+ messages in thread
From: Nitin Gupta @ 2011-02-09 17:36 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: Minchan Kim, gregkh, Chris Mason, akpm, torvalds, matthew,
	linux-kernel, linux-mm, jeremy, Kurt Hackel, npiggin, riel,
	Konrad Wilk, mel, kosaki.motohiro, sfr, wfg, tytso, viro, hughd,
	hannes

On 02/09/2011 11:39 AM, Dan Magenheimer wrote:
>
>
>> From: Minchan Kim [mailto:minchan.kim@gmail.com]
>
>> As I read your comment, I can't find the benefit of zram compared to
>> frontswap.
>
> Well, I am biased, but I agree that frontswap is a better technical
> solution than zram. ;-)  But "dynamic-ity" is very important to
> me and may be less important to others.
>


I agree that frontswap is better than zram when considering swap as the 
use case - no bio overhead, dynamic resizing. However, zram being a 
*generic* block-device has some unique cases too like hosting files on 
/tmp, various caches under /var or any place where a compressed 
in-memory block device can help.

So, frontswap and zram have overlapping use case of swap but are not the 
same.

Thanks,
Nitin

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-09 17:36         ` Nitin Gupta
@ 2011-02-09 23:46           ` Minchan Kim
  2011-02-10  1:17             ` Nitin Gupta
  0 siblings, 1 reply; 13+ messages in thread
From: Minchan Kim @ 2011-02-09 23:46 UTC (permalink / raw)
  To: Nitin Gupta
  Cc: Dan Magenheimer, gregkh, Chris Mason, akpm, torvalds, matthew,
	linux-kernel, linux-mm, jeremy, Kurt Hackel, npiggin, riel,
	Konrad Wilk, mel, kosaki.motohiro, sfr, wfg, tytso, viro, hughd,
	hannes

Hi Nitin,

Sorry for late response.

On Thu, Feb 10, 2011 at 2:36 AM, Nitin Gupta <ngupta@vflare.org> wrote:
> On 02/09/2011 11:39 AM, Dan Magenheimer wrote:
>>
>>
>>> From: Minchan Kim [mailto:minchan.kim@gmail.com]
>>
>>> As I read your comment, I can't find the benefit of zram compared to
>>> frontswap.
>>
>> Well, I am biased, but I agree that frontswap is a better technical
>> solution than zram. ;-)  But "dynamic-ity" is very important to
>> me and may be less important to others.
>>
>
>
> I agree that frontswap is better than zram when considering swap as the use
> case - no bio overhead, dynamic resizing. However, zram being a *generic*
> block-device has some unique cases too like hosting files on /tmp, various
> caches under /var or any place where a compressed in-memory block device can
> help.

Yes. I mentioned that benefit but I am not sure the reason is enough.
What I had in mind long time ago is that zram's functionality into brd.
So someone who want to compress contents could use it with some mount
option to enable compression.
By such way, many ramdisk user can turn it on easily.
If many user begin using the brd, we can see many report about
performance then solve brd performance s as well as zcache world-wide
usage.

Hmm,  the idea is too late?

>
> So, frontswap and zram have overlapping use case of swap but are not the
> same.

If we can insert zram's functionality into brd, maybe there is no
reason to coexist.


>
> Thanks,
> Nitin
>



-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-09 16:39       ` Dan Magenheimer
  2011-02-09 17:36         ` Nitin Gupta
@ 2011-02-09 23:57         ` Minchan Kim
  2011-02-09 23:58           ` Minchan Kim
  1 sibling, 1 reply; 13+ messages in thread
From: Minchan Kim @ 2011-02-09 23:57 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: gregkh, Chris Mason, akpm, torvalds, matthew, linux-kernel,
	linux-mm, ngupta, jeremy, Kurt Hackel, npiggin, riel,
	Konrad Wilk, mel, kosaki.motohiro, sfr, wfg, tytso, viro, hughd,
	hannes

On Thu, Feb 10, 2011 at 1:39 AM, Dan Magenheimer
<dan.magenheimer@oracle.com> wrote:
>
>
>> From: Minchan Kim [mailto:minchan.kim@gmail.com]
>
>> As I read your comment, I can't find the benefit of zram compared to
>> frontswap.
>
> Well, I am biased, but I agree that frontswap is a better technical
> solution than zram. ;-)  But "dynamic-ity" is very important to
> me and may be less important to others.
>
> I thought of these other differences, both technical and
> non-technical:
>
> - Zram is minimally invasive to the swap subsystem, requiring only
>  one hook which is already upstream (though see below) and is
>  apparently already used by some Linux users.  Frontswap is somewhat

Yes. I think what someone is using it is a problem.

>  more invasive and, UNTIL zcache-was-kztmem was posted a few weeks
>  ago, had no non-Xen users (though some distros are already shipping
>  the hooks in their kernels because Xen supports it); as a result,
>  frontswap has gotten almost no review by kernel swap subsystem
>  experts who I'm guessing weren't interested in anything that
>  required Xen to use... hopefully that barrier is now resolved
>  (but bottom line is frontswap is not yet upstream).

That's why I suggested to remove frontswap in this turn.
If any swap experts has a interest, maybe you can't receive any ack or
review about the part in this series. Maybe  maintainers ends up
hesitating the merge.

If zcache except frontswap is merged into mainline or receive enough
review, then you can try merging frontswap as further step.

Thanks.

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-09 23:57         ` Minchan Kim
@ 2011-02-09 23:58           ` Minchan Kim
  0 siblings, 0 replies; 13+ messages in thread
From: Minchan Kim @ 2011-02-09 23:58 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: gregkh, Chris Mason, akpm, torvalds, matthew, linux-kernel,
	linux-mm, ngupta, jeremy, Kurt Hackel, npiggin, riel,
	Konrad Wilk, mel, kosaki.motohiro, sfr, wfg, tytso, viro, hughd,
	hannes

On Thu, Feb 10, 2011 at 8:57 AM, Minchan Kim <minchan.kim@gmail.com> wrote:
> On Thu, Feb 10, 2011 at 1:39 AM, Dan Magenheimer
> <dan.magenheimer@oracle.com> wrote:
>>
>>
>>> From: Minchan Kim [mailto:minchan.kim@gmail.com]
>>
>>> As I read your comment, I can't find the benefit of zram compared to
>>> frontswap.
>>
>> Well, I am biased, but I agree that frontswap is a better technical
>> solution than zram. ;-)  But "dynamic-ity" is very important to
>> me and may be less important to others.
>>
>> I thought of these other differences, both technical and
>> non-technical:
>>
>> - Zram is minimally invasive to the swap subsystem, requiring only
>>  one hook which is already upstream (though see below) and is
>>  apparently already used by some Linux users.  Frontswap is somewhat
>
> Yes. I think what someone is using it is a problem.
>
>>  more invasive and, UNTIL zcache-was-kztmem was posted a few weeks
>>  ago, had no non-Xen users (though some distros are already shipping
>>  the hooks in their kernels because Xen supports it); as a result,
>>  frontswap has gotten almost no review by kernel swap subsystem
>>  experts who I'm guessing weren't interested in anything that
>>  required Xen to use... hopefully that barrier is now resolved
>>  (but bottom line is frontswap is not yet upstream).
>
> That's why I suggested to remove frontswap in this turn.
> If any swap experts has a interest, maybe you can't receive any ack or

Typo.
If any swap experts don't have a interest,


-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-09 23:46           ` Minchan Kim
@ 2011-02-10  1:17             ` Nitin Gupta
  0 siblings, 0 replies; 13+ messages in thread
From: Nitin Gupta @ 2011-02-10  1:17 UTC (permalink / raw)
  To: Minchan Kim
  Cc: Dan Magenheimer, gregkh, Chris Mason, akpm, torvalds, matthew,
	linux-kernel, linux-mm, jeremy, Kurt Hackel, npiggin, riel,
	Konrad Wilk, mel, kosaki.motohiro, sfr, wfg, tytso, viro, hughd,
	hannes

On 02/09/2011 06:46 PM, Minchan Kim wrote:
> Hi Nitin,
>
> Sorry for late response.
>
> On Thu, Feb 10, 2011 at 2:36 AM, Nitin Gupta<ngupta@vflare.org>  wrote:
>> On 02/09/2011 11:39 AM, Dan Magenheimer wrote:
>>>
>>>
>>>> From: Minchan Kim [mailto:minchan.kim@gmail.com]
>>>
>>>> As I read your comment, I can't find the benefit of zram compared to
>>>> frontswap.
>>>
>>> Well, I am biased, but I agree that frontswap is a better technical
>>> solution than zram. ;-)  But "dynamic-ity" is very important to
>>> me and may be less important to others.
>>>
>>
>>
>> I agree that frontswap is better than zram when considering swap as the use
>> case - no bio overhead, dynamic resizing. However, zram being a *generic*
>> block-device has some unique cases too like hosting files on /tmp, various
>> caches under /var or any place where a compressed in-memory block device can
>> help.
>
> Yes. I mentioned that benefit but I am not sure the reason is enough.
> What I had in mind long time ago is that zram's functionality into brd.
> So someone who want to compress contents could use it with some mount
> option to enable compression.
> By such way, many ramdisk user can turn it on easily.
> If many user begin using the brd, we can see many report about
> performance then solve brd performance s as well as zcache world-wide
> usage.
>
> Hmm,  the idea is too late?
>
>>
>> So, frontswap and zram have overlapping use case of swap but are not the
>> same.
>
> If we can insert zram's functionality into brd, maybe there is no
> reason to coexist.
>
>
>>

I thought about this before starting with zram development but thought 
adding compression (and in future, defrag) and use of custom allocator 
is just too much of a hassle and thus dropped the idea. If someone is 
anyhow interested in merging brd and zram I would be glad to help but I 
still think that is simply not worth the effort.

Thanks,
Nitin

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-07  3:26 [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services Dan Magenheimer
  2011-02-08 22:48 ` Minchan Kim
@ 2011-02-15 16:53 ` Konrad Rzeszutek Wilk
  2011-02-15 17:25   ` Greg KH
  1 sibling, 1 reply; 13+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-02-15 16:53 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: gregkh, chris.mason, akpm, torvalds, matthew, linux-kernel,
	linux-mm, ngupta, jeremy, kurt.hackel, npiggin, riel, mel,
	minchan.kim, kosaki.motohiro, sfr, wfg, tytso, viro, hughd,
	hannes

On Sun, Feb 06, 2011 at 07:26:08PM -0800, Dan Magenheimer wrote:
> [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services

Hey Dan,

I did a simple review of just reading the code and trying to grok it.

Greg,

How does the review work in staging tree? Would you just take
new patches from Dan based on my review and he should stick
'Reviewed-by: Konrad...' or .. ?

> 
> Zcache provides host services (memory allocation) for tmem,
> a "shim" to interface cleancache and frontswap to tmem, and
> two different page-addressable memory implemenations using
> lzo1x compression.  The first, "compression buddies" ("zbud")
> compresses pairs of pages and supplies a shrinker interface
> that allows entire pages to be reclaimed.  The second is
> a shim to xvMalloc which is more space-efficient but
> less receptive to page reclamation.  The first is used
> for ephemeral pools and the second for persistent pools.
> All ephemeral pools share the same memory, that is, even
> pages from different pools can share the same page.

Could you seperate out the zcache cleancache, zcache frontswap
and zcache sysfs implementation in different files? Or is that
not a particular good idea?

A nice little ASCII digram, similar to this:


 [ext4] -> cleancache                 [swap] ->frontswap
       \                                /
        \                              /
         \--------zcache  ------------/
                    |
                    |
                [tmem]
                    +
                   / \
                 /    \
          [zbud for]  [xv for]
          [lighttype]  [persistent]
          [pages]     [pages] 

could also help.

> 
> Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
> Signed-off-by: Nitin Gupta <ngupta@vflare.org>
> 
> ---
> 
> Diffstat:
>  drivers/staging/zcache/zcache.c          | 1657 +++++++++++++++++++++
>  1 file changed, 1657 insertions(+)
> 
> --- linux-2.6.37/drivers/staging/zcache/zcache.c	1969-12-31 17:00:00.000000000 -0700
> +++ linux-2.6.37-zcache/drivers/staging/zcache/zcache.c	2011-02-05 15:52:26.000000000 -0700
> @@ -0,0 +1,1657 @@
> +/*
> + * zcache.c
> + *
> + * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.

Take out the ',' after 2011.

> + * Copyright (c) 2010,2011, Nitin Gupta
Ditto.

> + *
> + * Zcache provides an in-kernel "host implementation" for transcendent memory

Does 'z' stand for anything?
> + * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
> + * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
> + * 1) "compression buddies" ("zbud") is used for ephemeral pages
> + * 2) xvmalloc is used for persistent pages.
> + * Xvmalloc (based on the TLSF allocator) has very low fragmentation
> + * so maximizes space efficiency, while zbud allows pairs (and potentially,
> + * in the future, more than a pair of) compressed pages to be closely linked
> + * so that reclaiming can be done via the kernel's physical-page-oriented
> + * "shrinker" interface.
> + *
> + * [1] For a definition of page-accessible memory (aka PAM), see:
> + *   http://marc.info/?l=linux-mm&m=127811271605009
> + */
> +
> +#include <linux/cpu.h>
> +#include <linux/highmem.h>
> +#include <linux/list.h>
> +#include <linux/lzo.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/types.h>
> +#include <linux/atomic.h>
> +#include "tmem.h"
> +
> +#include "../zram/xvmalloc.h" /* if built in drivers/staging */
> +
> +#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
> +#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
> +#endif
> +#ifdef CONFIG_CLEANCACHE
> +#include <linux/cleancache.h>
> +#endif
> +#ifdef CONFIG_FRONTSWAP
> +#include <linux/frontswap.h>
> +#endif
> +
> +#if 0
> +/* this is more aggressive but may cause other problems? */
> +#define ZCACHE_GFP_MASK	(GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
> +#else
> +#define ZCACHE_GFP_MASK \
> +	(__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
> +#endif
> +
> +/**********
> + * Compression buddies ("zbud") provides for packing two (or, possibly
> + * in the future, more) compressed ephemeral pages into a single "raw"

Why did you pick two in the initial patch? Was that a tradeoff between
optimization and wanting to have the data structures to be easy to understand?

Or is it just v1 and later might get tailored for better workloads?

> + * (physical) page and tracking them with data structures so that
> + * the raw pages can be easily reclaimed.
> + *
> + * A zbud page ("zbpg") is an aligned page containing a list_head,
> + * a lock, and two "zbud headers".  The remainder of the physical
> + * page is divided up into aligned 64-byte "chunks" which contain
> + * the compressed data for zero, one, or two zbuds.  Each zbpg

How are those chunks setup ? Are they stripped (so chunk#1 has
compressed data for #1 zbud, chunk #2 has compressed data for #2 zbud,
and so on)?

[edit: Looks like there are at maximum two swaths of compressed
data, where the #1 chunk is at the beginning (past the headers)
and the #2 chunk is aligned with the end of the page (and 64-byte
aligned), right?]

> + * resides on: (1) an "unused list" if it has no zbuds; (2) a
> + * "buddied" list if it is fully populated  with two zbuds; or
> + * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks

Uh, why PAGE_SIZE/64, and not PAGE_SIZE/sizeof(struct zbud compressed data)

If it is an indexed list, isn't that an array?

Maybe: "(3) an 64~ (determined by how many struct zbug_page we can stick
in a PAGE) array. The entry on the array is a list struct with a counter.
The index value determines how many unused chunks are in the zbud_page chunks.
Each zbud_page is linked depending on category of unused chunks on the
appropiate list."

That however starts to sound to much..

> + * the one unbuddied zbud uses.  The data inside a zbpg cannot be
> + * read or written unless the zbpg's lock is held.
> + */
> +
> +#define ZBH_SENTINEL  0x43214321
> +#define ZBPG_SENTINEL  0xdeadbeef
> +
> +#define ZBUD_MAX_BUDS 2

Might want to mention that just changing this to 3 won't actually work.
You need to make the augment the code that finds the proper header
to work with more than two pages of compressed data.

> +
> +struct zbud_hdr {
> +	uint32_t pool_id;
> +	struct tmem_oid oid;
> +	uint32_t index;
> +	uint16_t size; /* compressed size in bytes, zero means unused */
> +	DECL_SENTINEL
> +};
> +
> +struct zbud_page {
> +	struct list_head bud_list;
> +	spinlock_t lock;
> +	struct zbud_hdr buddy[ZBUD_MAX_BUDS];
> +	DECL_SENTINEL
> +	/* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
> +};
> +
> +#define CHUNK_SHIFT	6
> +#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
> +#define CHUNK_MASK	(~(CHUNK_SIZE-1))
> +#define NCHUNKS		(((PAGE_SIZE - sizeof(struct zbud_page)) & \
> +				CHUNK_MASK) >> CHUNK_SHIFT)
> +#define MAX_CHUNK	(NCHUNKS-1)
> +
> +static struct {
> +	struct list_head list;
> +	unsigned count;
> +} zbud_unbuddied[NCHUNKS];

zbud translates to 'z compression buddies', so when I read that
I think "z compression buddies unbuddied' How about 'zbud_lonely'??
Or 'zbud_single' ?


> +/* list N contains pages with N chunks USED and NCHUNKS-N unused */
> +/* element 0 is never used but optimizing that isn't worth it */
> +static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
> +
> +struct list_head zbud_buddied_list;

Ditto. "z compression buddied buddied list". How about just
'zbud_list' ?

> +static unsigned long zcache_zbud_buddied_count;

And here too, take out the 'zbud' out?

> +
> +/* protects the buddied list and all unbuddied lists */
> +static DEFINE_SPINLOCK(zbud_budlists_spinlock);
> +
> +static LIST_HEAD(zbpg_unused_list);
> +static unsigned long zcache_zbpg_unused_list_count;
> +
> +/* protects the unused page list */
> +static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
> +
> +static atomic_t zcache_zbud_curr_raw_pages;
> +static atomic_t zcache_zbud_curr_zpages;

That's a lot of Z's.. :-)

You are really catering this patch to Eastern Europeans with
'z's in their name, aren't you :-)

> +static unsigned long zcache_zbud_curr_zbytes;
> +static unsigned long zcache_zbud_cumul_zpages;

Now you are just having fun with those z's.

> +static unsigned long zcache_zbud_cumul_zbytes;

Heheh.
> +static unsigned long zcache_compress_poor;

Ah, what is that? Sounds like a watermark value?
Don't see it actually being exported in the sysfs
or debugfs.

> +
> +/* forward references */
> +static void *zcache_get_free_page(void);
> +static void zcache_free_page(void *p);
> +
> +/*
> + * zbud helper functions
> + */
> +
> +static inline unsigned zbud_max_buddy_size(void)
> +{
> +	return MAX_CHUNK << CHUNK_SHIFT;
> +}
> +
> +static inline unsigned zbud_size_to_chunks(unsigned size)
> +{
> +	BUG_ON(size == 0 || size > zbud_max_buddy_size());

You do a check for this in 'zcache_pampd_create' so this looks
uneccessary?
> +	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
> +}
> +
> +static inline int zbud_budnum(struct zbud_hdr *zh)
> +{
> +	unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
> +	struct zbud_page *zbpg = NULL;
> +	unsigned budnum = -1U;
> +	int i;
> +
> +	for (i = 0; i < ZBUD_MAX_BUDS; i++)
> +		if (offset == offsetof(typeof(*zbpg), buddy[i])) {
> +			budnum = i;
> +			break;
> +		}
> +	BUG_ON(budnum == -1U);
> +	return budnum;
> +}
> +
> +static char *zbud_data(struct zbud_hdr *zh, unsigned size)
> +{
> +	struct zbud_page *zbpg;
> +	char *p;
> +	unsigned budnum;
> +
> +	ASSERT_SENTINEL(zh, ZBH);
> +	budnum = zbud_budnum(zh);
> +	BUG_ON(size == 0 || size > zbud_max_buddy_size());

Ditto. zcache_pampd_create already checks for this.

> +	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
> +	ASSERT_SPINLOCK(&zbpg->lock);
> +	p = (char *)zbpg;
> +	if (budnum == 0)
> +		p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
> +							CHUNK_MASK);
> +	else if (budnum == 1)
> +		p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
> +	return p;
> +}
> +
> +/*
> + * zbud raw page management
> + */
> +
> +static struct zbud_page *zbud_alloc_raw_page(void)
> +{
> +	struct zbud_page *zbpg = NULL;
> +	struct zbud_hdr *zh0, *zh1;
> +	bool recycled = 0;
> +
> +	/* if any pages on the zbpg list, use one */
> +	spin_lock(&zbpg_unused_list_spinlock);
> +	if (!list_empty(&zbpg_unused_list)) {
> +		zbpg = list_first_entry(&zbpg_unused_list,
> +				struct zbud_page, bud_list);
> +		list_del_init(&zbpg->bud_list);
> +		zcache_zbpg_unused_list_count--;
> +		recycled = 1;
> +	}
> +	spin_unlock(&zbpg_unused_list_spinlock);
> +	if (zbpg == NULL)
> +		/* none on zbpg list, try to get a kernel page */
> +		zbpg = zcache_get_free_page();
> +	if (likely(zbpg != NULL)) {
> +		INIT_LIST_HEAD(&zbpg->bud_list);
> +		zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
> +		spin_lock_init(&zbpg->lock);
> +		if (recycled) {
> +			ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
> +			SET_SENTINEL(zbpg, ZBPG);
> +			BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
> +			BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
> +		} else {
> +			atomic_inc(&zcache_zbud_curr_raw_pages);
> +			INIT_LIST_HEAD(&zbpg->bud_list);

I think you are doing this the second time?

> +			SET_SENTINEL(zbpg, ZBPG);
> +			zh0->size = 0; zh1->size = 0;
> +			tmem_oid_set_invalid(&zh0->oid);
> +			tmem_oid_set_invalid(&zh1->oid);
> +		}
> +	}
> +	return zbpg;
> +}
> +
> +static void zbud_free_raw_page(struct zbud_page *zbpg)

You aren't actually free-ing the raw page here. You are
just moving it to the unused_list. Perhaps a better name
for the function is 'zbud_move_to_unused' ?

> +{
> +	struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
> +
> +	ASSERT_SENTINEL(zbpg, ZBPG);
> +	BUG_ON(!list_empty(&zbpg->bud_list));
> +	ASSERT_SPINLOCK(&zbpg->lock);
> +	BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
> +	BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
> +	INVERT_SENTINEL(zbpg, ZBPG);
> +	spin_unlock(&zbpg->lock);
> +	spin_lock(&zbpg_unused_list_spinlock);
> +	list_add(&zbpg->bud_list, &zbpg_unused_list);
> +	zcache_zbpg_unused_list_count++;
> +	spin_unlock(&zbpg_unused_list_spinlock);
> +}
> +
> +/*
> + * core zbud handling routines
> + */
> +
> +static unsigned zbud_free(struct zbud_hdr *zh)
> +{
> +	unsigned size;
> +
> +	ASSERT_SENTINEL(zh, ZBH);
> +	BUG_ON(!tmem_oid_valid(&zh->oid));
> +	size = zh->size;
> +	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
> +	zh->size = 0;
> +	tmem_oid_set_invalid(&zh->oid);
> +	INVERT_SENTINEL(zh, ZBH);
> +	zcache_zbud_curr_zbytes -= size;
> +	atomic_dec(&zcache_zbud_curr_zpages);
> +	return size;
> +}
> +
> +static void zbud_free_and_delist(struct zbud_hdr *zh)
> +{
> +	unsigned chunks;
> +	struct zbud_hdr *zh_other;
> +	unsigned budnum = zbud_budnum(zh), size;
> +	struct zbud_page *zbpg =
> +		container_of(zh, struct zbud_page, buddy[budnum]);
> +
> +	spin_lock(&zbpg->lock);
> +	if (list_empty(&zbpg->bud_list)) {
> +		/* ignore zombie page... see zbud_evict_pages() */
> +		spin_unlock(&zbpg->lock);
> +		return;
> +	}
> +	size = zbud_free(zh);
> +	ASSERT_SPINLOCK(&zbpg->lock);
> +	zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
> +	if (zh_other->size == 0) { /* was unbuddied: unlist and free */
> +		chunks = zbud_size_to_chunks(size) ;
> +		spin_lock(&zbud_budlists_spinlock);
> +		BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
> +		list_del_init(&zbpg->bud_list);
> +		zbud_unbuddied[chunks].count--;
> +		spin_unlock(&zbud_budlists_spinlock);
> +		zbud_free_raw_page(zbpg);
> +	} else { /* was buddied: move remaining buddy to unbuddied list */
> +		chunks = zbud_size_to_chunks(zh_other->size) ;
> +		spin_lock(&zbud_budlists_spinlock);
> +		list_del_init(&zbpg->bud_list);
> +		zcache_zbud_buddied_count--;
> +		list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
> +		zbud_unbuddied[chunks].count++;
> +		spin_unlock(&zbud_budlists_spinlock);
> +		spin_unlock(&zbpg->lock);
> +	}
> +}
> +
> +static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid,
> +					uint32_t index, struct page *page,
> +					void *cdata, unsigned size)
> +{
> +	struct zbud_hdr *zh0, *zh1, *zh = NULL;
> +	struct zbud_page *zbpg = NULL, *ztmp;
> +	unsigned nchunks;
> +	char *to;
> +	int i, found_good_buddy = 0;
> +
> +	nchunks = zbud_size_to_chunks(size) ;
> +	for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
> +		spin_lock(&zbud_budlists_spinlock);
> +		if (!list_empty(&zbud_unbuddied[i].list)) {
> +			list_for_each_entry_safe(zbpg, ztmp,
> +				    &zbud_unbuddied[i].list, bud_list) {
> +				if (spin_trylock(&zbpg->lock)) {
> +					found_good_buddy = i;
> +					goto found_unbuddied;
> +				}
> +			}
> +		}
> +		spin_unlock(&zbud_budlists_spinlock);
> +	}
> +	/* didn't find a good buddy, try allocating a new page */
> +	zbpg = zbud_alloc_raw_page();
> +	if (unlikely(zbpg == NULL))
> +		goto out;
> +	/* ok, have a page, now compress the data before taking locks */
> +	spin_lock(&zbpg->lock);
> +	spin_lock(&zbud_budlists_spinlock);
> +	list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
> +	zbud_unbuddied[nchunks].count++;
> +	zh = &zbpg->buddy[0];
> +	goto init_zh;
> +
> +found_unbuddied:
> +	ASSERT_SPINLOCK(&zbpg->lock);
> +	zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
> +	BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
> +	if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
> +		ASSERT_SENTINEL(zh0, ZBH);
> +		zh = zh1;
> +	} else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
> +		ASSERT_SENTINEL(zh1, ZBH);
> +		zh = zh0;
> +	} else
> +		BUG();
> +	list_del_init(&zbpg->bud_list);
> +	zbud_unbuddied[found_good_buddy].count--;
> +	list_add_tail(&zbpg->bud_list, &zbud_buddied_list);

OK, so we found an zbud_page with one header filed, we are going
to fill in the other (and also the data), so we move it
to the zbud_buddied_list.

And the 'zbud_data' later on figures out exactly where to stick
the compressed data. Is there any debugfs code (perhaps in the 
future?) to iterate over all zbud_buddied_list, and give the size of
compressed data? Not sure if that is actually useful in practice
but I am curious what is the average size of compressed pages?


> +	zcache_zbud_buddied_count++;
> +
> +init_zh:
> +	SET_SENTINEL(zh, ZBH);
> +	zh->size = size;
> +	zh->index = index;
> +	zh->oid = *oid;
> +	zh->pool_id = pool_id;
> +	/* can wait to copy the data until the list locks are dropped */
> +	spin_unlock(&zbud_budlists_spinlock);
> +
> +	to = zbud_data(zh, size);
> +	memcpy(to, cdata, size);

> +	spin_unlock(&zbpg->lock);
> +	zbud_cumul_chunk_counts[nchunks]++;
> +	atomic_inc(&zcache_zbud_curr_zpages);
> +	zcache_zbud_cumul_zpages++;
> +	zcache_zbud_curr_zbytes += size;
> +	zcache_zbud_cumul_zbytes += size;
> +out:
> +	return zh;
> +}
> +
> +static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
> +{
> +	struct zbud_page *zbpg;
> +	unsigned budnum = zbud_budnum(zh);
> +	size_t out_len = PAGE_SIZE;
> +	char *to_va, *from_va;
> +	unsigned size;
> +	int ret = 0;
> +
> +	zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
> +	spin_lock(&zbpg->lock);
> +	if (list_empty(&zbpg->bud_list)) {
> +		/* ignore zombie page... see zbud_evict_pages() */
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +	ASSERT_SENTINEL(zh, ZBH);
> +	BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());

Would that actually occur? That must be some really wacked zbud_page
to have that weird data - you are guarding against this in the
zcache_pampd_create when you check the clen.


> +	to_va = kmap_atomic(page, KM_USER0);
> +	size = zh->size;
> +	from_va = zbud_data(zh, size);
> +	ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
> +	BUG_ON(ret != LZO_E_OK);
> +	BUG_ON(out_len != PAGE_SIZE);
> +	kunmap_atomic(to_va, KM_USER0);
> +out:
> +	spin_unlock(&zbpg->lock);
> +	return ret;
> +}
> +
> +/*
> + * The following routines handle shrinking of ephemeral pages by evicting
> + * pages "least valuable" first.
> + */
> +
> +static unsigned long zcache_evicted_raw_pages;
> +static unsigned long zcache_evicted_buddied_pages;
> +static unsigned long zcache_evicted_unbuddied_pages;
> +
> +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid);
> +static void zcache_put_pool(struct tmem_pool *pool);
> +
> +/*
> + * Flush and free all zbuds in a zbpg, then free the pageframe
> + */
> +static void zbud_evict_zbpg(struct zbud_page *zbpg)
> +{
> +	struct zbud_hdr *zh;
> +	int i, j;
> +	uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS];
> +	struct tmem_oid oid[ZBUD_MAX_BUDS];
> +	struct tmem_pool *pool;
> +
> +	ASSERT_SPINLOCK(&zbpg->lock);
> +	BUG_ON(!list_empty(&zbpg->bud_list));
> +	for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
> +		zh = &zbpg->buddy[i];
> +		if (zh->size) {
> +			pool_id[j] = zh->pool_id;
> +			oid[j] = zh->oid;
> +			index[j] = zh->index;
> +			j++;
> +			zbud_free(zh);
> +		}
> +	}
> +	spin_unlock(&zbpg->lock);
> +	for (i = 0; i < j; i++) {
> +		pool = zcache_get_pool_by_id(pool_id[i]);
> +		if (pool != NULL) {
> +			tmem_flush_page(pool, &oid[i], index[i]);
> +			zcache_put_pool(pool);
> +		}
> +	}
> +	ASSERT_SENTINEL(zbpg, ZBPG);
> +	spin_lock(&zbpg->lock);
> +	zbud_free_raw_page(zbpg);
> +}
> +
> +/*
> + * Free nr pages.  This code is funky because we want to hold the locks
> + * protecting various lists for as short a time as possible, and in some
> + * circumstances the list may change asynchronously when the list lock is
> + * not held.  In some cases we also trylock not only to avoid waiting on a
> + * page in use by another cpu, but also to avoid potential deadlock due to
> + * lock inversion.
> + */
> +static void zbud_evict_pages(int nr)
> +{
> +	struct zbud_page *zbpg;
> +	int i;
> +
> +	/* first try freeing any pages on unused list */
> +retry_unused_list:
> +	spin_lock_bh(&zbpg_unused_list_spinlock);
> +	if (!list_empty(&zbpg_unused_list)) {
> +		/* can't walk list here, since it may change when unlocked */
> +		zbpg = list_first_entry(&zbpg_unused_list,
> +				struct zbud_page, bud_list);
> +		list_del_init(&zbpg->bud_list);
> +		zcache_zbpg_unused_list_count--;
> +		atomic_dec(&zcache_zbud_curr_raw_pages);
> +		spin_unlock_bh(&zbpg_unused_list_spinlock);
> +		zcache_free_page(zbpg);
> +		zcache_evicted_raw_pages++;
> +		if (--nr <= 0)
> +			goto out;
> +		goto retry_unused_list;
> +	}
> +	spin_unlock_bh(&zbpg_unused_list_spinlock);
> +
> +	/* now try freeing unbuddied pages, starting with least space avail */
> +	for (i = 0; i < MAX_CHUNK; i++) {
> +retry_unbud_list_i:
> +		spin_lock_bh(&zbud_budlists_spinlock);
> +		if (list_empty(&zbud_unbuddied[i].list)) {
> +			spin_unlock_bh(&zbud_budlists_spinlock);
> +			continue;
> +		}
> +		list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
> +			if (unlikely(!spin_trylock(&zbpg->lock)))
> +				continue;
> +			list_del_init(&zbpg->bud_list);
> +			zbud_unbuddied[i].count--;
> +			spin_unlock(&zbud_budlists_spinlock);
> +			zcache_evicted_unbuddied_pages++;
> +			/* want budlists unlocked when doing zbpg eviction */
> +			zbud_evict_zbpg(zbpg);
> +			local_bh_enable();
> +			if (--nr <= 0)
> +				goto out;
> +			goto retry_unbud_list_i;
> +		}
> +		spin_unlock_bh(&zbud_budlists_spinlock);
> +	}
> +
> +	/* as a last resort, free buddied pages */
> +retry_bud_list:
> +	spin_lock_bh(&zbud_budlists_spinlock);
> +	if (list_empty(&zbud_buddied_list)) {
> +		spin_unlock_bh(&zbud_budlists_spinlock);
> +		goto out;
> +	}
> +	list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
> +		if (unlikely(!spin_trylock(&zbpg->lock)))
> +			continue;
> +		list_del_init(&zbpg->bud_list);
> +		zcache_zbud_buddied_count--;
> +		spin_unlock(&zbud_budlists_spinlock);
> +		zcache_evicted_buddied_pages++;
> +		/* want budlists unlocked when doing zbpg eviction */
> +		zbud_evict_zbpg(zbpg);
> +		local_bh_enable();
> +		if (--nr <= 0)
> +			goto out;
> +		goto retry_bud_list;
> +	}
> +	spin_unlock_bh(&zbud_budlists_spinlock);
> +out:
> +	return;
> +}
> +
> +static void zbud_init(void)
> +{
> +	int i;
> +
> +	INIT_LIST_HEAD(&zbud_buddied_list);
> +	zcache_zbud_buddied_count = 0;
> +	for (i = 0; i < NCHUNKS; i++) {
> +		INIT_LIST_HEAD(&zbud_unbuddied[i].list);
> +		zbud_unbuddied[i].count = 0;
> +	}
> +}
> +
> +#ifdef CONFIG_SYSFS
> +/*
> + * These sysfs routines show a nice distribution of how many zbpg's are
> + * currently (and have ever been placed) in each unbuddied list.  It's fun
> + * to watch but can probably go away before final merge.
> + */
> +static int zbud_show_unbuddied_list_counts(char *buf)
> +{
> +	int i;
> +	char *p = buf;
> +
> +	for (i = 0; i < NCHUNKS - 1; i++)
> +		p += sprintf(p, "%u ", zbud_unbuddied[i].count);
> +	p += sprintf(p, "%d\n", zbud_unbuddied[i].count);
> +	return p - buf;
> +}
> +
> +static int zbud_show_cumul_chunk_counts(char *buf)
> +{
> +	unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
> +	unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
> +	unsigned long total_chunks_lte_42 = 0;
> +	char *p = buf;
> +
> +	for (i = 0; i < NCHUNKS; i++) {
> +		p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
> +		chunks += zbud_cumul_chunk_counts[i];
> +		total_chunks += zbud_cumul_chunk_counts[i];
> +		sum_total_chunks += i * zbud_cumul_chunk_counts[i];
> +		if (i == 21)
> +			total_chunks_lte_21 = total_chunks;
> +		if (i == 32)
> +			total_chunks_lte_32 = total_chunks;
> +		if (i == 42)
> +			total_chunks_lte_42 = total_chunks;
> +	}
> +	p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
> +		total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
> +		chunks == 0 ? 0 : sum_total_chunks / chunks);
> +	return p - buf;
> +}
> +#endif
> +
> +/**********
> + * This "zv" PAM implementation combines the TLSF-based xvMalloc
> + * with lzo1x compression to maximize the amount of data that can
> + * be packed into a physical page.
> + *
> + * Zv represents a PAM page with the index and object (plus a "size" value
> + * necessary for decompression) immediately preceding the compressed data.

Not seeing the 'size' value in the header? Is it embedded in the 
xv malloc data structure? If so, you might want to mention that.
> + */
> +
> +#define ZVH_SENTINEL  0x43214321
> +
> +struct zv_hdr {
> +	uint32_t pool_id;
> +	struct tmem_oid oid;
> +	uint32_t index;
> +	DECL_SENTINEL
> +};
> +
> +static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;

Justification? Should you use the PAGE_SIZE-sizeof(struct zv_hdr) instead?

> +
> +static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,
> +				struct tmem_oid *oid, uint32_t index,
> +				void *cdata, unsigned clen)
> +{
> +	struct page *page;
> +	struct zv_hdr *zv = NULL;
> +	uint32_t offset;
> +	int ret;
> +
> +	BUG_ON(!irqs_disabled());
> +	ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr),
> +			&page, &offset, ZCACHE_GFP_MASK);
> +	if (unlikely(ret))
> +		goto out;
> +	zv = kmap_atomic(page, KM_USER0) + offset;
> +	zv->index = index;
> +	zv->oid = *oid;
> +	zv->pool_id = pool_id;
> +	SET_SENTINEL(zv, ZVH);
> +	memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
> +	kunmap_atomic(zv, KM_USER0);
> +out:
> +	return zv;
> +}
> +
> +static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
> +{
> +	unsigned long flags;
> +	struct page *page;
> +	uint32_t offset;
> +	uint16_t size;
> +
> +	ASSERT_SENTINEL(zv, ZVH);
> +	size = xv_get_object_size(zv) - sizeof(*zv);
> +	BUG_ON(size == 0 || size > zv_max_page_size);
> +	INVERT_SENTINEL(zv, ZVH);
> +	page = virt_to_page(zv);
> +	offset = (unsigned long)zv & ~PAGE_MASK;
> +	local_irq_save(flags);
> +	xv_free(xvpool, page, offset);
> +	local_irq_restore(flags);
> +}
> +
> +static void zv_decompress(struct page *page, struct zv_hdr *zv)
> +{
> +	size_t clen = PAGE_SIZE;
> +	char *to_va;
> +	unsigned size;
> +	int ret;
> +
> +	ASSERT_SENTINEL(zv, ZVH);
> +	size = xv_get_object_size(zv) - sizeof(*zv);
> +	BUG_ON(size == 0 || size > zv_max_page_size);
> +	to_va = kmap_atomic(page, KM_USER0);
> +	ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
> +					size, to_va, &clen);
> +	kunmap_atomic(to_va, KM_USER0);
> +	BUG_ON(ret != LZO_E_OK);
> +	BUG_ON(clen != PAGE_SIZE);
> +}
> +
> +/*
> + * zcache core code starts here
> + */
> +
> +/* useful stats not collected by cleancache or frontswap */
> +static unsigned long zcache_flush_total;
> +static unsigned long zcache_flush_found;
> +static unsigned long zcache_flobj_total;
> +static unsigned long zcache_flobj_found;
> +static unsigned long zcache_failed_eph_puts;
> +static unsigned long zcache_failed_pers_puts;
> +
> +#define MAX_POOLS_PER_CLIENT 16

Why sixteen?

> +
> +static struct {
> +	struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
> +	struct xv_pool *xvpool;
> +} zcache_client;
> +
> +/*
> + * Tmem operations assume the poolid implies the invoking client.
> + * Zcache only has one client (the kernel itself), so translate
> + * the poolid into the tmem_pool allocated for it.  A KVM version
> + * of zcache would have one client per guest and each client might
> + * have a poolid==N.
> + */
> +static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid)
> +{
> +	struct tmem_pool *pool = NULL;
> +
> +	if (poolid >= 0) {
> +		pool = zcache_client.tmem_pools[poolid];
> +		if (pool != NULL)
> +			atomic_inc(&pool->refcount);
> +	}
> +	return pool;
> +}
> +
> +static void zcache_put_pool(struct tmem_pool *pool)
> +{
> +	if (pool != NULL)
> +		atomic_dec(&pool->refcount);
> +}
> +
> +/* counters for debugging */
> +static unsigned long zcache_failed_get_free_pages;
> +static unsigned long zcache_failed_alloc;
> +static unsigned long zcache_put_to_flush;
> +static unsigned long zcache_aborted_preload;
> +static unsigned long zcache_aborted_shrink;
> +
> +/*
> + * Ensure that memory allocation requests in zcache don't result
> + * in direct reclaim requests via the shrinker, which would cause
> + * an infinite loop.  Maybe a GFP flag would be better?
> + */
> +static DEFINE_SPINLOCK(zcache_direct_reclaim_lock);
> +
> +/*
> + * for now, used named slabs so can easily track usage; later can

use
> + * either just use kmalloc, or perhaps add a slab-like allocator
> + * to more carefully manage total memory utilization
> + */
> +static struct kmem_cache *zcache_objnode_cache;
> +static struct kmem_cache *zcache_obj_cache;
> +static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
> +static unsigned long zcache_curr_obj_count_max;
> +static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
> +static unsigned long zcache_curr_objnode_count_max;
> +
> +/*
> + * to avoid memory allocation recursion (e.g. due to direct reclaim), we
> + * preload all necessary data structures so the hostops callbacks never
> + * actually do a malloc
> + */
> +struct zcache_preload {
> +	void *page;
> +	struct tmem_obj *obj;
> +	int nr;
> +	struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
> +};
> +static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
> +
> +static int zcache_do_preload(struct tmem_pool *pool)
> +{
> +	struct zcache_preload *kp;
> +	struct tmem_objnode *objnode;
> +	struct tmem_obj *obj;
> +	void *page;
> +	int ret = -ENOMEM;
> +
> +	if (unlikely(zcache_objnode_cache == NULL))
> +		goto out;
> +	if (unlikely(zcache_obj_cache == NULL))
> +		goto out;
> +	if (!spin_trylock(&zcache_direct_reclaim_lock)) {
> +		zcache_aborted_preload++;
> +		goto out;
> +	}
> +	preempt_disable();
> +	kp = &__get_cpu_var(zcache_preloads);
> +	while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
> +		preempt_enable_no_resched();
> +		objnode = kmem_cache_alloc(zcache_objnode_cache,
> +				ZCACHE_GFP_MASK);
> +		if (unlikely(objnode == NULL)) {
> +			zcache_failed_alloc++;
> +			goto unlock_out;
> +		}
> +		preempt_disable();
> +		kp = &__get_cpu_var(zcache_preloads);
> +		if (kp->nr < ARRAY_SIZE(kp->objnodes))
> +			kp->objnodes[kp->nr++] = objnode;
> +		else
> +			kmem_cache_free(zcache_objnode_cache, objnode);

That is it? No zcache_failed_alloc++; goto unlock_out?

> +	}
> +	preempt_enable_no_resched();
> +	obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
> +	if (unlikely(obj == NULL)) {
> +		zcache_failed_alloc++;

You don't want to do:
			kmem_cache_free(zcache_objnode_cache, objnode);
			kp->objnmodes[kp->nr--] = NULL;

?
> +		goto unlock_out;
> +	}
> +	page = (void *)__get_free_page(ZCACHE_GFP_MASK);
> +	if (unlikely(page == NULL)) {
> +		zcache_failed_get_free_pages++;
> +		goto unlock_out;

Same thing here. We don't want to cleanup?
> +	}
> +	preempt_disable();
> +	kp = &__get_cpu_var(zcache_preloads);
> +	if (kp->obj == NULL)
> +		kp->obj = obj;
> +	else
> +		kmem_cache_free(zcache_obj_cache, obj);
> +	if (kp->page == NULL)
> +		kp->page = page;
> +	else
> +		free_page((unsigned long)page);
> +	ret = 0;
> +unlock_out:
> +	spin_unlock(&zcache_direct_reclaim_lock);
> +out:
> +	return ret;
> +}
> +
> +static void *zcache_get_free_page(void)
> +{
> +	struct zcache_preload *kp;
> +	void *page;
> +
> +	kp = &__get_cpu_var(zcache_preloads);
> +	page = kp->page;
> +	BUG_ON(page == NULL);
> +	kp->page = NULL;
> +	return page;
> +}
> +
> +static void zcache_free_page(void *p)
> +{
> +	free_page((unsigned long)p);
> +}
> +
> +/*
> + * zcache implementation for tmem host ops
> + */
> +
> +static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
> +{
> +	struct tmem_objnode *objnode = NULL;
> +	unsigned long count;
> +	struct zcache_preload *kp;
> +
> +	kp = &__get_cpu_var(zcache_preloads);
> +	if (kp->nr <= 0)
> +		goto out;
> +	objnode = kp->objnodes[kp->nr - 1];
> +	BUG_ON(objnode == NULL);
> +	kp->objnodes[kp->nr - 1] = NULL;
> +	kp->nr--;
> +	count = atomic_inc_return(&zcache_curr_objnode_count);
> +	if (count > zcache_curr_objnode_count_max)
> +		zcache_curr_objnode_count_max = count;
> +out:
> +	return objnode;
> +}
> +
> +static void zcache_objnode_free(struct tmem_objnode *objnode,
> +					struct tmem_pool *pool)
> +{
> +	atomic_dec(&zcache_curr_objnode_count);
> +	BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
> +	kmem_cache_free(zcache_objnode_cache, objnode);
> +}
> +
> +static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
> +{
> +	struct tmem_obj *obj = NULL;
> +	unsigned long count;
> +	struct zcache_preload *kp;
> +
> +	kp = &__get_cpu_var(zcache_preloads);
> +	obj = kp->obj;
> +	BUG_ON(obj == NULL);
> +	kp->obj = NULL;
> +	count = atomic_inc_return(&zcache_curr_obj_count);
> +	if (count > zcache_curr_obj_count_max)
> +		zcache_curr_obj_count_max = count;
> +	return obj;
> +}
> +
> +static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
> +{
> +	atomic_dec(&zcache_curr_obj_count);
> +	BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
> +	kmem_cache_free(zcache_obj_cache, obj);

You remove it from the slab and don't populate the zcache_preloads.

What is the interaction between zcache_put_page and the zcache_obj_free?
Is the zcache_put_page done before the zcache_obj_free so that the
zcache_do_preload gets to be called to replenish the zcache_preloads?
> +}
> +
> +static struct tmem_hostops zcache_hostops = {
> +	.obj_alloc = zcache_obj_alloc,
> +	.obj_free = zcache_obj_free,
> +	.objnode_alloc = zcache_objnode_alloc,
> +	.objnode_free = zcache_objnode_free,
> +};
> +
> +/*
> + * zcache implementations for PAM page descriptor ops
> + */
> +
> +static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
> +static unsigned long zcache_curr_eph_pampd_count_max;
> +static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
> +static unsigned long zcache_curr_pers_pampd_count_max;
> +
> +/* forward reference */
> +static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
> +
> +static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
> +				 uint32_t index, struct page *page)
> +{
> +	void *pampd = NULL, *cdata;
> +	size_t clen;
> +	int ret;
> +	bool ephemeral = is_ephemeral(pool);
> +	unsigned long count;
> +
> +	if (ephemeral) {
> +		ret = zcache_compress(page, &cdata, &clen);
> +		if (ret == 0)
> +

Why the extra newline?
> +			goto out;
> +		if (clen == 0 || clen > zbud_max_buddy_size()) {
> +			zcache_compress_poor++;
> +			goto out;
> +		}
> +		pampd = (void *)zbud_create(pool->pool_id, oid, index,
> +						page, cdata, clen);
> +		if (pampd != NULL) {
> +			count = atomic_inc_return(&zcache_curr_eph_pampd_count);
> +			if (count > zcache_curr_eph_pampd_count_max)
> +				zcache_curr_eph_pampd_count_max = count;
> +		}
> +	} else {
> +		/*
> +		 * FIXME: This is all the "policy" there is for now.
> +		 * 3/4 totpages should allow ~37% of RAM to be filled with
> +		 * compressed frontswap pages
> +		 */
> +		if (atomic_read(&zcache_curr_pers_pampd_count) >
> +							3 * totalram_pages / 4)

Perhaps in the future a sysfs knob?

> +			goto out;
> +		ret = zcache_compress(page, &cdata, &clen);
> +		if (ret == 0)
> +			goto out;
> +		if (clen > zv_max_page_size) {
> +			zcache_compress_poor++;
> +			goto out;
> +		}
> +		pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id,
> +						oid, index, cdata, clen);
> +		if (pampd == NULL)
> +			goto out;
> +		count = atomic_inc_return(&zcache_curr_pers_pampd_count);
> +		if (count > zcache_curr_pers_pampd_count_max)
> +			zcache_curr_pers_pampd_count_max = count;
> +	}
> +out:
> +	return pampd;
> +}
> +
> +/*
> + * fill the pageframe corresponding to the struct page with the data
> + * from the passed pampd
> + */
> +static int zcache_pampd_get_data(struct page *page, void *pampd,
> +						struct tmem_pool *pool)
> +{
> +	int ret = 0;
> +
> +	if (is_ephemeral(pool))
> +		ret = zbud_decompress(page, pampd);
> +	else
> +		zv_decompress(page, pampd);
> +	return ret;
> +}
> +
> +/*
> + * free the pampd and remove it from any zcache lists
> + * pampd must no longer be pointed to from any tmem data structures!
> + */
> +static void zcache_pampd_free(void *pampd, struct tmem_pool *pool)
> +{
> +	if (is_ephemeral(pool)) {
> +		zbud_free_and_delist((struct zbud_hdr *)pampd);
> +		atomic_dec(&zcache_curr_eph_pampd_count);
> +		BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
> +	} else {
> +		zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd);
> +		atomic_dec(&zcache_curr_pers_pampd_count);
> +		BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
> +	}
> +}
> +
> +static struct tmem_pamops zcache_pamops = {
> +	.create = zcache_pampd_create,
> +	.get_data = zcache_pampd_get_data,
> +	.free = zcache_pampd_free,
> +};
> +
> +/*
> + * zcache compression/decompression and related per-cpu stuff
> + */
> +
> +#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
> +#define LZO_DSTMEM_PAGE_ORDER 1
> +static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
> +static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
> +
> +static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
> +{
> +	int ret = 0;
> +	unsigned char *dmem = __get_cpu_var(zcache_dstmem);
> +	unsigned char *wmem = __get_cpu_var(zcache_workmem);
> +	char *from_va;
> +
> +	BUG_ON(!irqs_disabled());
> +	if (unlikely(dmem == NULL || wmem == NULL))
> +		goto out;  /* no buffer, so can't compress */
> +	from_va = kmap_atomic(from, KM_USER0);
> +	mb();
> +	ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
> +	BUG_ON(ret != LZO_E_OK);
> +	*out_va = dmem;
> +	kunmap_atomic(from_va, KM_USER0);
> +	ret = 1;
> +out:
> +	return ret;
> +}
> +
> +
> +static int zcache_cpu_notifier(struct notifier_block *nb,
> +				unsigned long action, void *pcpu)
> +{
> +	int cpu = (long)pcpu;
> +	struct zcache_preload *kp;
> +
> +	switch (action) {
> +	case CPU_UP_PREPARE:
> +		per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
> +			GFP_KERNEL | __GFP_REPEAT,
> +			LZO_DSTMEM_PAGE_ORDER),
> +		per_cpu(zcache_workmem, cpu) =
> +			kzalloc(LZO1X_MEM_COMPRESS,
> +				GFP_KERNEL | __GFP_REPEAT);

Is it possible that those could fail? If so why should be done?

> +		break;
> +	case CPU_DEAD:
> +	case CPU_UP_CANCELED:
> +		free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
> +				LZO_DSTMEM_PAGE_ORDER);
> +		per_cpu(zcache_dstmem, cpu) = NULL;
> +		kfree(per_cpu(zcache_workmem, cpu));
> +		per_cpu(zcache_workmem, cpu) = NULL;
> +		kp = &per_cpu(zcache_preloads, cpu);
> +		while (kp->nr) {
> +			kmem_cache_free(zcache_objnode_cache,
> +					kp->objnodes[kp->nr - 1]);
> +			kp->objnodes[kp->nr - 1] = NULL;
> +			kp->nr--;
> +		}
> +		kmem_cache_free(zcache_obj_cache, kp->obj);
> +		free_page((unsigned long)kp->page);
> +		break;
> +	default:
> +		break;
> +	}
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block zcache_cpu_notifier_block = {
> +	.notifier_call = zcache_cpu_notifier
> +};
> +
> +#ifdef CONFIG_SYSFS
> +#define ZCACHE_SYSFS_RO(_name) \
> +	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
> +				struct kobj_attribute *attr, char *buf) \
> +	{ \
> +		return sprintf(buf, "%lu\n", zcache_##_name); \
> +	} \
> +	static struct kobj_attribute zcache_##_name##_attr = { \
> +		.attr = { .name = __stringify(_name), .mode = 0444 }, \
> +		.show = zcache_##_name##_show, \
> +	}
> +
> +#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
> +	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
> +				struct kobj_attribute *attr, char *buf) \
> +	{ \
> +	    return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
> +	} \
> +	static struct kobj_attribute zcache_##_name##_attr = { \
> +		.attr = { .name = __stringify(_name), .mode = 0444 }, \
> +		.show = zcache_##_name##_show, \
> +	}
> +
> +#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
> +	static ssize_t zcache_##_name##_show(struct kobject *kobj, \
> +				struct kobj_attribute *attr, char *buf) \
> +	{ \
> +	    return _func(buf); \
> +	} \
> +	static struct kobj_attribute zcache_##_name##_attr = { \
> +		.attr = { .name = __stringify(_name), .mode = 0444 }, \
> +		.show = zcache_##_name##_show, \
> +	}
> +
> +ZCACHE_SYSFS_RO(curr_obj_count_max);
> +ZCACHE_SYSFS_RO(curr_objnode_count_max);
> +ZCACHE_SYSFS_RO(flush_total);
> +ZCACHE_SYSFS_RO(flush_found);
> +ZCACHE_SYSFS_RO(flobj_total);
> +ZCACHE_SYSFS_RO(flobj_found);
> +ZCACHE_SYSFS_RO(failed_eph_puts);
> +ZCACHE_SYSFS_RO(failed_pers_puts);
> +ZCACHE_SYSFS_RO(zbud_curr_zbytes);
> +ZCACHE_SYSFS_RO(zbud_cumul_zpages);
> +ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
> +ZCACHE_SYSFS_RO(zbud_buddied_count);
> +ZCACHE_SYSFS_RO(zbpg_unused_list_count);
> +ZCACHE_SYSFS_RO(evicted_raw_pages);
> +ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
> +ZCACHE_SYSFS_RO(evicted_buddied_pages);
> +ZCACHE_SYSFS_RO(failed_get_free_pages);
> +ZCACHE_SYSFS_RO(failed_alloc);
> +ZCACHE_SYSFS_RO(put_to_flush);
> +ZCACHE_SYSFS_RO(aborted_preload);
> +ZCACHE_SYSFS_RO(aborted_shrink);
> +ZCACHE_SYSFS_RO(compress_poor);
> +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
> +ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
> +ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
> +ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
> +ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
> +			zbud_show_unbuddied_list_counts);
> +ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
> +			zbud_show_cumul_chunk_counts);
> +
> +static struct attribute *zcache_attrs[] = {
> +	&zcache_curr_obj_count_attr.attr,
> +	&zcache_curr_obj_count_max_attr.attr,
> +	&zcache_curr_objnode_count_attr.attr,
> +	&zcache_curr_objnode_count_max_attr.attr,
> +	&zcache_flush_total_attr.attr,
> +	&zcache_flobj_total_attr.attr,
> +	&zcache_flush_found_attr.attr,
> +	&zcache_flobj_found_attr.attr,
> +	&zcache_failed_eph_puts_attr.attr,
> +	&zcache_failed_pers_puts_attr.attr,
> +	&zcache_compress_poor_attr.attr,
> +	&zcache_zbud_curr_raw_pages_attr.attr,
> +	&zcache_zbud_curr_zpages_attr.attr,
> +	&zcache_zbud_curr_zbytes_attr.attr,
> +	&zcache_zbud_cumul_zpages_attr.attr,
> +	&zcache_zbud_cumul_zbytes_attr.attr,
> +	&zcache_zbud_buddied_count_attr.attr,
> +	&zcache_zbpg_unused_list_count_attr.attr,
> +	&zcache_evicted_raw_pages_attr.attr,
> +	&zcache_evicted_unbuddied_pages_attr.attr,
> +	&zcache_evicted_buddied_pages_attr.attr,
> +	&zcache_failed_get_free_pages_attr.attr,
> +	&zcache_failed_alloc_attr.attr,
> +	&zcache_put_to_flush_attr.attr,
> +	&zcache_aborted_preload_attr.attr,
> +	&zcache_aborted_shrink_attr.attr,
> +	&zcache_zbud_unbuddied_list_counts_attr.attr,
> +	&zcache_zbud_cumul_chunk_counts_attr.attr,
> +	NULL,
> +};
> +
> +static struct attribute_group zcache_attr_group = {
> +	.attrs = zcache_attrs,
> +	.name = "zcache",
> +};
> +
> +#endif /* CONFIG_SYSFS */
> +/*
> + * When zcache is disabled ("frozen"), pools can be created and destroyed,
> + * but all puts (and thus all other operations that require memory allocation)
> + * must fail.  If zcache is unfrozen, accepts puts, then frozen again,
> + * data consistency requires all puts while frozen to be converted into
> + * flushes.
> + */
> +static bool zcache_freeze;

There does not seem to be any toggle for this. So why have it?
> +
> +/*
> + * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
> + */
> +static int shrink_zcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
> +{
> +	int ret = -1;
> +
> +	if (nr >= 0) {
> +		if (!(gfp_mask & __GFP_FS))
> +			/* does this case really need to be skipped? */
> +			goto out;
> +		if (spin_trylock(&zcache_direct_reclaim_lock)) {
> +			zbud_evict_pages(nr);
> +			spin_unlock(&zcache_direct_reclaim_lock);
> +		} else
> +			zcache_aborted_shrink++;
> +	}
> +	ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
> +out:
> +	return ret;
> +}
> +
> +static struct shrinker zcache_shrinker = {
> +	.shrink = shrink_zcache_memory,
> +	.seeks = DEFAULT_SEEKS,
> +};
> +
> +/*
> + * zcache shims between cleancache/frontswap ops and tmem
> + */
> +
> +static int zcache_put_page(int pool_id, struct tmem_oid *oidp,
> +				uint32_t index, struct page *page)
> +{
> +	struct tmem_pool *pool;
> +	int ret = -1;
> +
> +	BUG_ON(!irqs_disabled());
> +	pool = zcache_get_pool_by_id(pool_id);
> +	if (unlikely(pool == NULL))
> +		goto out;
> +	if (!zcache_freeze && zcache_do_preload(pool) == 0) {
> +		/* preload does preempt_disable on success */
> +		ret = tmem_put(pool, oidp, index, page);
> +		if (ret < 0) {
> +			if (is_ephemeral(pool))
> +				zcache_failed_eph_puts++;
> +			else
> +				zcache_failed_pers_puts++;
> +		}
> +		zcache_put_pool(pool);
> +		preempt_enable_no_resched();
> +	} else {
> +		zcache_put_to_flush++;
> +		if (atomic_read(&pool->obj_count) > 0)
> +			/* the put fails whether the flush succeeds or not */
> +			(void)tmem_flush_page(pool, oidp, index);
> +		zcache_put_pool(pool);
> +	}
> +out:
> +	return ret;
> +}
> +
> +static int zcache_get_page(int pool_id, struct tmem_oid *oidp,
> +				uint32_t index, struct page *page)
> +{
> +	struct tmem_pool *pool;
> +	int ret = -1;
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	pool = zcache_get_pool_by_id(pool_id);
> +	if (likely(pool != NULL)) {
> +		if (atomic_read(&pool->obj_count) > 0)
> +			ret = tmem_get(pool, oidp, index, page);
> +		zcache_put_pool(pool);
> +	}
> +	local_irq_restore(flags);
> +	return ret;
> +}
> +
> +static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index)
> +{
> +	struct tmem_pool *pool;
> +	int ret = -1;
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	zcache_flush_total++;
> +	pool = zcache_get_pool_by_id(pool_id);
> +	if (likely(pool != NULL)) {
> +		if (atomic_read(&pool->obj_count) > 0)
> +			ret = tmem_flush_page(pool, oidp, index);
> +		zcache_put_pool(pool);
> +	}
> +	if (ret >= 0)
> +		zcache_flush_found++;
> +	local_irq_restore(flags);
> +	return ret;
> +}
> +
> +static int zcache_flush_object(int pool_id, struct tmem_oid *oidp)
> +{
> +	struct tmem_pool *pool;
> +	int ret = -1;
> +	unsigned long flags;
> +
> +	local_irq_save(flags);
> +	zcache_flobj_total++;
> +	pool = zcache_get_pool_by_id(pool_id);
> +	if (likely(pool != NULL)) {
> +		if (atomic_read(&pool->obj_count) > 0)
> +			ret = tmem_flush_object(pool, oidp);
> +		zcache_put_pool(pool);
> +	}
> +	if (ret >= 0)
> +		zcache_flobj_found++;
> +	local_irq_restore(flags);
> +	return ret;
> +}
> +
> +static int zcache_destroy_pool(int pool_id)
> +{
> +	struct tmem_pool *pool = NULL;
> +	int ret = -1;
> +
> +	if (pool_id < 0)
> +		goto out;
> +	pool = zcache_client.tmem_pools[pool_id];
> +	if (pool == NULL)
> +		goto out;
> +	zcache_client.tmem_pools[pool_id] = NULL;
> +	/* wait for pool activity on other cpus to quiesce */
> +	while (atomic_read(&pool->refcount) != 0)
> +		;
> +	local_bh_disable();
> +	ret = tmem_destroy_pool(pool);
> +	local_bh_enable();
> +	kfree(pool);
> +	pr_info("zcache: destroyed pool id=%d\n", pool_id);
> +out:
> +	return ret;
> +}
> +
> +static int zcache_new_pool(uint32_t flags)
> +{
> +	int poolid = -1;
> +	struct tmem_pool *pool;
> +
> +	pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
> +	if (pool == NULL) {
> +		pr_info("zcache: pool creation failed: out of memory\n");
> +		goto out;
> +	}
> +
> +	for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
> +		if (zcache_client.tmem_pools[poolid] == NULL)
> +			break;
> +	if (poolid >= MAX_POOLS_PER_CLIENT) {
> +		pr_info("zcache: pool creation failed: max exceeded\n");
> +		kfree(pool);
> +		poolid = -1;
> +		goto out;
> +	}
> +	atomic_set(&pool->refcount, 0);
> +	pool->client = &zcache_client;
> +	pool->pool_id = poolid;
> +	tmem_new_pool(pool, flags);
> +	zcache_client.tmem_pools[poolid] = pool;
> +	pr_info("zcache: created %s tmem pool, id=%d\n",
> +		flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
> +		poolid);
> +out:
> +	return poolid;
> +}
> +
> +/**********
> + * Two kernel functionalities currently can be layered on top of tmem.
> + * These are "cleancache" which is used as a second-chance cache for clean
> + * page cache pages; and "frontswap" which is used for swap pages
> + * to avoid writes to disk.  A generic "shim" is provided here for each
> + * to translate in-kernel semantics to zcache semantics.
> + */
> +
> +#ifdef CONFIG_CLEANCACHE
> +static void zcache_cleancache_put_page(int pool_id,
> +					struct cleancache_filekey key,
> +					pgoff_t index, struct page *page)
> +{
> +	u32 ind = (u32) index;
> +	struct tmem_oid oid = *(struct tmem_oid *)&key;
> +
> +	if (likely(ind == index))
> +		(void)zcache_put_page(pool_id, &oid, index, page);
> +}
> +
> +static int zcache_cleancache_get_page(int pool_id,
> +					struct cleancache_filekey key,
> +					pgoff_t index, struct page *page)
> +{
> +	u32 ind = (u32) index;
> +	struct tmem_oid oid = *(struct tmem_oid *)&key;
> +	int ret = -1;
> +
> +	if (likely(ind == index))
> +		ret = zcache_get_page(pool_id, &oid, index, page);
> +	return ret;
> +}
> +
> +static void zcache_cleancache_flush_page(int pool_id,
> +					struct cleancache_filekey key,
> +					pgoff_t index)
> +{
> +	u32 ind = (u32) index;
> +	struct tmem_oid oid = *(struct tmem_oid *)&key;
> +
> +	if (likely(ind == index))
> +		(void)zcache_flush_page(pool_id, &oid, ind);
> +}
> +
> +static void zcache_cleancache_flush_inode(int pool_id,
> +					struct cleancache_filekey key)
> +{
> +	struct tmem_oid oid = *(struct tmem_oid *)&key;
> +
> +	(void)zcache_flush_object(pool_id, &oid);
> +}
> +
> +static void zcache_cleancache_flush_fs(int pool_id)
> +{
> +	if (pool_id >= 0)
> +		(void)zcache_destroy_pool(pool_id);
> +}
> +
> +static int zcache_cleancache_init_fs(size_t pagesize)
> +{
> +	BUG_ON(sizeof(struct cleancache_filekey) !=
> +				sizeof(struct tmem_oid));
> +	BUG_ON(pagesize != PAGE_SIZE);
> +	return zcache_new_pool(0);
> +}
> +
> +static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
> +{
> +	/* shared pools are unsupported and map to private */
> +	BUG_ON(sizeof(struct cleancache_filekey) !=
> +				sizeof(struct tmem_oid));
> +	BUG_ON(pagesize != PAGE_SIZE);
> +	return zcache_new_pool(0);
> +}
> +
> +static struct cleancache_ops zcache_cleancache_ops = {
> +	.put_page = zcache_cleancache_put_page,
> +	.get_page = zcache_cleancache_get_page,
> +	.flush_page = zcache_cleancache_flush_page,
> +	.flush_inode = zcache_cleancache_flush_inode,
> +	.flush_fs = zcache_cleancache_flush_fs,
> +	.init_shared_fs = zcache_cleancache_init_shared_fs,
> +	.init_fs = zcache_cleancache_init_fs
> +};
> +
> +struct cleancache_ops zcache_cleancache_register_ops(void)
> +{
> +	struct cleancache_ops old_ops =
> +		cleancache_register_ops(&zcache_cleancache_ops);
> +
> +	return old_ops;
> +}
> +#endif
> +
> +#ifdef CONFIG_FRONTSWAP
> +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
> +static int zcache_frontswap_poolid = -1;
> +
> +/*
> + * Swizzling increases objects per swaptype, increasing tmem concurrency
> + * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS

Is this b/c of the hash size used in tmem pool??

> + */
> +#define SWIZ_BITS		4
> +#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
> +#define _oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
> +#define iswiz(_ind)		(_ind >> SWIZ_BITS)
> +
> +static inline struct tmem_oid oswiz(unsigned type, u32 ind)
> +{
> +	struct tmem_oid oid = { .oid = { 0 } };
> +	oid.oid[0] = _oswiz(type, ind);
> +	return oid;
> +}
> +
> +static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
> +				   struct page *page)
> +{
> +	u64 ind64 = (u64)offset;
> +	u32 ind = (u32)offset;
> +	struct tmem_oid oid = oswiz(type, ind);
> +	int ret = -1;
> +	unsigned long flags;
> +
> +	BUG_ON(!PageLocked(page));
> +	if (likely(ind64 == ind)) {
> +		local_irq_save(flags);
> +		ret = zcache_put_page(zcache_frontswap_poolid, &oid,
> +					iswiz(ind), page);

The third argument is the offset of a page shifted to the right by four.
Why? It ends up being used as 'index' value, not as an oid?


> +		local_irq_restore(flags);
> +	}
> +	return ret;
> +}
> +
> +/* returns 0 if the page was successfully gotten from frontswap, -1 if
> + * was not present (should never happen!) */
> +static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
> +				   struct page *page)
> +{
> +	u64 ind64 = (u64)offset;
> +	u32 ind = (u32)offset;
> +	struct tmem_oid oid = oswiz(type, ind);
> +	int ret = -1;
> +
> +	BUG_ON(!PageLocked(page));
> +	if (likely(ind64 == ind))
> +		ret = zcache_get_page(zcache_frontswap_poolid, &oid,
> +					iswiz(ind), page);
> +	return ret;
> +}
> +
> +/* flush a single page from frontswap */
> +static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
> +{
> +	u64 ind64 = (u64)offset;
> +	u32 ind = (u32)offset;
> +	struct tmem_oid oid = oswiz(type, ind);
> +
> +	if (likely(ind64 == ind))
> +		(void)zcache_flush_page(zcache_frontswap_poolid, &oid,
> +					iswiz(ind));
> +}
> +
> +/* flush all pages from the passed swaptype */
> +static void zcache_frontswap_flush_area(unsigned type)
> +{
> +	struct tmem_oid oid;
> +	int ind;
> +
> +	for (ind = SWIZ_MASK; ind >= 0; ind--) {
> +		oid = oswiz(type, ind);
> +		(void)zcache_flush_object(zcache_frontswap_poolid, &oid);
> +	}
> +}
> +
> +static void zcache_frontswap_init(unsigned ignored)
> +{
> +	/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
> +	if (zcache_frontswap_poolid < 0)
> +		zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST);
> +}
> +
> +static struct frontswap_ops zcache_frontswap_ops = {
> +	.put_page = zcache_frontswap_put_page,
> +	.get_page = zcache_frontswap_get_page,
> +	.flush_page = zcache_frontswap_flush_page,
> +	.flush_area = zcache_frontswap_flush_area,
> +	.init = zcache_frontswap_init
> +};
> +
> +struct frontswap_ops zcache_frontswap_register_ops(void)
> +{
> +	struct frontswap_ops old_ops =
> +		frontswap_register_ops(&zcache_frontswap_ops);
> +
> +	return old_ops;
> +}
> +#endif
> +
> +/*
> + * zcache initialization
> + * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
> + * NOTHING HAPPENS!

How does this work when this is compiled as a module?
> + */
> +
> +static int zcache_enabled;
> +
> +static int __init enable_zcache(char *s)
> +{
> +	zcache_enabled = 1;
> +	return 1;
> +}
> +__setup("zcache", enable_zcache);
> +
> +/* allow independent dynamic disabling of cleancache and frontswap */
> +
> +static int use_cleancache = 1;
> +
> +static int __init no_cleancache(char *s)
> +{
> +	use_cleancache = 0;
> +	return 1;
> +}
> +
> +__setup("nocleancache", no_cleancache);
> +
> +static int use_frontswap = 1;
> +
> +static int __init no_frontswap(char *s)
> +{
> +	use_frontswap = 0;
> +	return 1;
> +}
> +
> +__setup("nofrontswap", no_frontswap);
> +
> +static int __init zcache_init(void)
> +{
> +#ifdef CONFIG_SYSFS

Stick all of the sysfs code in a seperate file. Say similar
to zram_sysfs.c

> +	int ret = 0;
> +
> +	ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
> +	if (ret) {
> +		pr_err("zcache: can't create sysfs\n");
> +		goto out;
> +	}
> +#endif /* CONFIG_SYSFS */
> +#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
> +	if (zcache_enabled) {
> +		unsigned int cpu;
> +
> +		tmem_register_hostops(&zcache_hostops);
> +		tmem_register_pamops(&zcache_pamops);
> +		ret = register_cpu_notifier(&zcache_cpu_notifier_block);
> +		if (ret) {
> +			pr_err("zcache: can't register cpu notifier\n");

Shouldn't you de-register from tmem if you fail here?

> +			goto out;
> +		}
> +		for_each_online_cpu(cpu) {
> +			void *pcpu = (void *)(long)cpu;
> +			zcache_cpu_notifier(&zcache_cpu_notifier_block,
> +				CPU_UP_PREPARE, pcpu);
> +		}
Would it make sense to interchange the 'register_cpu_notifier' with this loop, so
that you first go over all the data structures for each CPU (and set the up). And
then once you are done, call the cpu notifier??

> +	}
> +	zcache_objnode_cache = kmem_cache_create("zcache_objnode",
> +				sizeof(struct tmem_objnode), 0, 0, NULL);
> +	zcache_obj_cache = kmem_cache_create("zcache_obj",
> +				sizeof(struct tmem_obj), 0, 0, NULL);

You ought to move those before do the CPU registration. The user could
be a sadistic one that during the modprobe bring the CPU down and the 
zcache_cpu_notifier_block would try to de-allocate objects from this
cache before this cache has been allocated.

> +#endif
> +#ifdef CONFIG_CLEANCACHE
> +	if (zcache_enabled && use_cleancache) {
> +		struct cleancache_ops old_ops;
> +
> +		zbud_init();
> +		register_shrinker(&zcache_shrinker);
> +		old_ops = zcache_cleancache_register_ops();
> +		pr_info("zcache: cleancache enabled using kernel "
> +			"transcendent memory and compression buddies\n");
> +		if (old_ops.init_fs != NULL)
> +			pr_warning("zcache: cleancache_ops overridden");
> +	}
> +#endif
> +#ifdef CONFIG_FRONTSWAP
> +	if (zcache_enabled && use_frontswap) {
> +		struct frontswap_ops old_ops;
> +
> +		zcache_client.xvpool = xv_create_pool();
> +		if (zcache_client.xvpool == NULL) {
> +			pr_err("zcache: can't create xvpool\n");
> +			goto out;
> +		}
> +		old_ops = zcache_frontswap_register_ops();
> +		pr_info("zcache: frontswap enabled using kernel "
> +			"transcendent memory and xvmalloc\n");
> +		if (old_ops.init != NULL)
> +			pr_warning("ktmem: frontswap_ops overridden");
> +	}
> +#endif
> +out:

If we fail, we should at least cleanup sysfs by unregistering it.

> +	return ret;
> +}
> +
> +module_init(zcache_init)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-15 16:53 ` Konrad Rzeszutek Wilk
@ 2011-02-15 17:25   ` Greg KH
  2011-02-15 18:18     ` Konrad Rzeszutek Wilk
  0 siblings, 1 reply; 13+ messages in thread
From: Greg KH @ 2011-02-15 17:25 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk
  Cc: Dan Magenheimer, chris.mason, akpm, torvalds, matthew,
	linux-kernel, linux-mm, ngupta, jeremy, kurt.hackel, npiggin,
	riel, mel, minchan.kim, kosaki.motohiro, sfr, wfg, tytso, viro,
	hughd, hannes

On Tue, Feb 15, 2011 at 11:53:53AM -0500, Konrad Rzeszutek Wilk wrote:
> On Sun, Feb 06, 2011 at 07:26:08PM -0800, Dan Magenheimer wrote:
> > [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
> 
> Hey Dan,
> 
> I did a simple review of just reading the code and trying to grok it.
> 
> Greg,
> 
> How does the review work in staging tree? Would you just take
> new patches from Dan based on my review and he should stick
> 'Reviewed-by: Konrad...' or .. ?

I would, if I hadn't already committed these patches, so it'e a bit too
late for adding that, sorry.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
  2011-02-15 17:25   ` Greg KH
@ 2011-02-15 18:18     ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 13+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-02-15 18:18 UTC (permalink / raw)
  To: Greg KH
  Cc: Dan Magenheimer, chris.mason, akpm, torvalds, matthew,
	linux-kernel, linux-mm, ngupta, jeremy, kurt.hackel, npiggin,
	riel, mel, minchan.kim, kosaki.motohiro, sfr, wfg, tytso, viro,
	hughd, hannes

On Tue, Feb 15, 2011 at 09:25:48AM -0800, Greg KH wrote:
> On Tue, Feb 15, 2011 at 11:53:53AM -0500, Konrad Rzeszutek Wilk wrote:
> > On Sun, Feb 06, 2011 at 07:26:08PM -0800, Dan Magenheimer wrote:
> > > [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services
> > 
> > Hey Dan,
> > 
> > I did a simple review of just reading the code and trying to grok it.
> > 
> > Greg,
> > 
> > How does the review work in staging tree? Would you just take
> > new patches from Dan based on my review and he should stick
> > 'Reviewed-by: Konrad...' or .. ?
> 
> I would, if I hadn't already committed these patches, so it'e a bit too
> late for adding that, sorry.

That is OK. Thanks.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2011-02-15 18:20 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-02-07  3:26 [PATCH V2 2/3] drivers/staging: zcache: host services and PAM services Dan Magenheimer
2011-02-08 22:48 ` Minchan Kim
2011-02-08 23:27   ` Dan Magenheimer
2011-02-08 23:56     ` Minchan Kim
2011-02-09 16:39       ` Dan Magenheimer
2011-02-09 17:36         ` Nitin Gupta
2011-02-09 23:46           ` Minchan Kim
2011-02-10  1:17             ` Nitin Gupta
2011-02-09 23:57         ` Minchan Kim
2011-02-09 23:58           ` Minchan Kim
2011-02-15 16:53 ` Konrad Rzeszutek Wilk
2011-02-15 17:25   ` Greg KH
2011-02-15 18:18     ` Konrad Rzeszutek Wilk

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).