Diffstat (limited to 'drivers/staging/zcache/ramster/ramster.c')
-rw-r--r-- | drivers/staging/zcache/ramster/ramster.c | 985 |
1 file changed, 985 insertions, 0 deletions
diff --git a/drivers/staging/zcache/ramster/ramster.c b/drivers/staging/zcache/ramster/ramster.c
new file mode 100644
index 000000000000..bf96a1cbf7c1
--- /dev/null
+++ b/drivers/staging/zcache/ramster/ramster.c
@@ -0,0 +1,985 @@
+/*
+ * ramster.c
+ *
+ * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
+ *
+ * RAMster implements peer-to-peer transcendent memory, allowing a "cluster" of
+ * kernels to dynamically pool their RAM so that a RAM-hungry workload on one
+ * machine can temporarily and transparently utilize RAM on another machine
+ * which is presumably idle or running a non-RAM-hungry workload.
+ *
+ * RAMster combines a clustering and messaging foundation based on the ocfs2
+ * cluster layer with the in-kernel compression implementation of zcache, and
+ * adds code to glue them together.  When a page is "put" to RAMster, it is
+ * compressed and stored locally.  Periodically, a thread will "remotify" these
+ * pages by sending them via messages to a remote machine.  When the page is
+ * later needed as indicated by a page fault, a "get" is issued.  If the data
+ * is local, it is uncompressed and the fault is resolved.  If the data is
+ * remote, a message is sent to fetch the data and the faulting thread sleeps;
+ * when the data arrives, the thread awakens, the data is decompressed and
+ * the fault is resolved.
+ *
+ * As of V5, clusters of up to eight nodes are supported; each node can
+ * remotify pages to one specified node, so clusters can be configured as
+ * clients to a "memory server".  Some simple policy is in place that will
+ * need to be refined over time.  Larger clusters and fault-resistant
+ * protocols can also be added over time.
+ */
+
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/highmem.h>
+#include <linux/list.h>
+#include <linux/lzo.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <linux/frontswap.h>
+#include "../tmem.h"
+#include "../zcache.h"
+#include "../zbud.h"
+#include "ramster.h"
+#include "ramster_nodemanager.h"
+#include "tcp.h"
+
+#define RAMSTER_TESTING
+
+#ifndef CONFIG_SYSFS
+#error "ramster needs sysfs to define cluster nodes to use"
+#endif
+
+static bool use_cleancache __read_mostly;
+static bool use_frontswap __read_mostly;
+static bool use_frontswap_exclusive_gets __read_mostly;
+
+/* These must be sysfs not debugfs as they are checked/used by userland!! */
+static unsigned long ramster_interface_revision __read_mostly =
+	R2NM_API_VERSION; /* interface revision must match userspace! */
+static unsigned long ramster_pers_remotify_enable __read_mostly;
+static unsigned long ramster_eph_remotify_enable __read_mostly;
+static atomic_t ramster_remote_pers_pages = ATOMIC_INIT(0);
+#define MANUAL_NODES 8
+static bool ramster_nodes_manual_up[MANUAL_NODES] __read_mostly;
+static int ramster_remote_target_nodenum __read_mostly = -1;
+
+/* these counters are made available via debugfs */
+static long ramster_flnodes;
+static atomic_t ramster_flnodes_atomic = ATOMIC_INIT(0);
+static unsigned long ramster_flnodes_max;
+static ssize_t ramster_foreign_eph_pages;
+static atomic_t ramster_foreign_eph_pages_atomic = ATOMIC_INIT(0);
+static ssize_t ramster_foreign_eph_pages_max;
+static ssize_t ramster_foreign_pers_pages;
+static atomic_t ramster_foreign_pers_pages_atomic = ATOMIC_INIT(0);
+static ssize_t ramster_foreign_pers_pages_max;
+static ssize_t ramster_eph_pages_remoted;
+static ssize_t ramster_pers_pages_remoted;
+static ssize_t ramster_eph_pages_remote_failed;
+static ssize_t ramster_pers_pages_remote_failed;
+static ssize_t ramster_remote_eph_pages_succ_get;
+static ssize_t ramster_remote_pers_pages_succ_get;
+static ssize_t ramster_remote_eph_pages_unsucc_get;
+static ssize_t ramster_remote_pers_pages_unsucc_get;
+static ssize_t ramster_pers_pages_remote_nomem;
+static ssize_t ramster_remote_objects_flushed;
+static ssize_t ramster_remote_object_flushes_failed;
+static ssize_t ramster_remote_pages_flushed;
+static ssize_t ramster_remote_page_flushes_failed;
+/* FIXME frontswap selfshrinking knobs in debugfs? */
+
+#ifdef CONFIG_DEBUG_FS
+#include <linux/debugfs.h>
+#define	zdfs	debugfs_create_size_t
+#define	zdfs64	debugfs_create_u64
+static int __init ramster_debugfs_init(void)
+{
+	struct dentry *root = debugfs_create_dir("ramster", NULL);
+
+	if (root == NULL)
+		return -ENXIO;
+
+	zdfs("eph_pages_remoted", S_IRUGO, root, &ramster_eph_pages_remoted);
+	zdfs("pers_pages_remoted", S_IRUGO, root, &ramster_pers_pages_remoted);
+	zdfs("eph_pages_remote_failed", S_IRUGO, root,
+		&ramster_eph_pages_remote_failed);
+	zdfs("pers_pages_remote_failed", S_IRUGO, root,
+		&ramster_pers_pages_remote_failed);
+	zdfs("remote_eph_pages_succ_get", S_IRUGO, root,
+		&ramster_remote_eph_pages_succ_get);
+	zdfs("remote_pers_pages_succ_get", S_IRUGO, root,
+		&ramster_remote_pers_pages_succ_get);
+	zdfs("remote_eph_pages_unsucc_get", S_IRUGO, root,
+		&ramster_remote_eph_pages_unsucc_get);
+	zdfs("remote_pers_pages_unsucc_get", S_IRUGO, root,
+		&ramster_remote_pers_pages_unsucc_get);
+	zdfs("pers_pages_remote_nomem", S_IRUGO, root,
+		&ramster_pers_pages_remote_nomem);
+	zdfs("remote_objects_flushed", S_IRUGO, root,
+		&ramster_remote_objects_flushed);
+	zdfs("remote_pages_flushed", S_IRUGO, root,
+		&ramster_remote_pages_flushed);
+	zdfs("remote_object_flushes_failed", S_IRUGO, root,
+		&ramster_remote_object_flushes_failed);
+	zdfs("remote_page_flushes_failed", S_IRUGO, root,
+		&ramster_remote_page_flushes_failed);
+	zdfs("foreign_eph_pages", S_IRUGO, root,
+		&ramster_foreign_eph_pages);
+	zdfs("foreign_eph_pages_max", S_IRUGO, root,
+		&ramster_foreign_eph_pages_max);
+	zdfs("foreign_pers_pages", S_IRUGO, root,
+		&ramster_foreign_pers_pages);
+	zdfs("foreign_pers_pages_max", S_IRUGO, root,
+		&ramster_foreign_pers_pages_max);
+	return 0;
+}
+#undef	zdfs
+#undef	zdfs64
+#else
+static inline int ramster_debugfs_init(void)
+{
+	return 0;
+}
+#endif
+
+static LIST_HEAD(ramster_rem_op_list);
+static DEFINE_SPINLOCK(ramster_rem_op_list_lock);
+static DEFINE_PER_CPU(struct ramster_preload, ramster_preloads);
+
+static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem1);
+static DEFINE_PER_CPU(unsigned char *, ramster_remoteputmem2);
+
+static struct kmem_cache *ramster_flnode_cache __read_mostly;
+
+static struct flushlist_node *ramster_flnode_alloc(struct tmem_pool *pool)
+{
+	struct flushlist_node *flnode = NULL;
+	struct ramster_preload *kp;
+
+	kp = &__get_cpu_var(ramster_preloads);
+	flnode = kp->flnode;
+	BUG_ON(flnode == NULL);
+	kp->flnode = NULL;
+	ramster_flnodes = atomic_inc_return(&ramster_flnodes_atomic);
+	if (ramster_flnodes > ramster_flnodes_max)
+		ramster_flnodes_max = ramster_flnodes;
+	return flnode;
+}
+
+/* the "flush list" asynchronously collects pages to remotely flush */
+#define FLUSH_ENTIRE_OBJECT ((uint32_t)-1)
+static void ramster_flnode_free(struct flushlist_node *flnode,
+				struct tmem_pool *pool)
+{
+	int flnodes;
+
+	flnodes = atomic_dec_return(&ramster_flnodes_atomic);
+	BUG_ON(flnodes < 0);
+	kmem_cache_free(ramster_flnode_cache, flnode);
+}
+
+int ramster_do_preload_flnode(struct tmem_pool *pool)
+{
+	struct ramster_preload *kp;
+	struct flushlist_node *flnode;
+	int ret = -ENOMEM;
+
+	BUG_ON(!irqs_disabled());
+	if (unlikely(ramster_flnode_cache == NULL))
+		BUG();
+	kp = &__get_cpu_var(ramster_preloads);
+	flnode = kmem_cache_alloc(ramster_flnode_cache, GFP_ATOMIC);
+	if (unlikely(flnode == NULL) && kp->flnode == NULL)
+		BUG(); /* FIXME handle more gracefully, but how??? */
+	else if (kp->flnode == NULL)
+		kp->flnode = flnode;
+	else
+		kmem_cache_free(ramster_flnode_cache, flnode);
+	return ret;
+}
+
+/*
+ * Called by the message handler after a (still compressed) page has been
+ * fetched from the remote machine in response to an "is_remote" tmem_get
+ * or persistent tmem_localify.  For a tmem_get, "extra" is the address of
+ * the page that is to be filled to successfully resolve the tmem_get; for
+ * a (persistent) tmem_localify, "extra" is NULL (as the data is placed only
+ * in the local zcache).  "data" points to "size" bytes of (compressed) data
+ * passed in the message.  In the case of a persistent remote get, if
+ * pre-allocation was successful (see ramster_repatriate_preload), the page
+ * is placed into both local zcache and at "extra".
+ */
+int ramster_localify(int pool_id, struct tmem_oid *oidp, uint32_t index,
+			char *data, unsigned int size, void *extra)
+{
+	int ret = -ENOENT;
+	unsigned long flags;
+	struct tmem_pool *pool;
+	bool eph, delete = false;
+	void *pampd, *saved_hb;
+	struct tmem_obj *obj;
+
+	pool = zcache_get_pool_by_id(LOCAL_CLIENT, pool_id);
+	if (unlikely(pool == NULL))
+		/* pool doesn't exist anymore */
+		goto out;
+	eph = is_ephemeral(pool);
+	local_irq_save(flags); /* FIXME: maybe only disable softirqs? */
+	pampd = tmem_localify_get_pampd(pool, oidp, index, &obj, &saved_hb);
+	if (pampd == NULL) {
+		/* hmmm... must have been a flush while waiting */
+#ifdef RAMSTER_TESTING
+		pr_err("UNTESTED pampd==NULL in ramster_localify\n");
+#endif
+		if (eph)
+			ramster_remote_eph_pages_unsucc_get++;
+		else
+			ramster_remote_pers_pages_unsucc_get++;
+		obj = NULL;
+		goto finish;
+	} else if (unlikely(!pampd_is_remote(pampd))) {
+		/* hmmm... must have been a dup put while waiting */
+#ifdef RAMSTER_TESTING
+		pr_err("UNTESTED dup while waiting in ramster_localify\n");
+#endif
+		if (eph)
+			ramster_remote_eph_pages_unsucc_get++;
+		else
+			ramster_remote_pers_pages_unsucc_get++;
+		obj = NULL;
+		pampd = NULL;
+		ret = -EEXIST;
+		goto finish;
+	} else if (size == 0) {
+		/* no remote data, delete the local is_remote pampd */
+		pampd = NULL;
+		if (eph)
+			ramster_remote_eph_pages_unsucc_get++;
+		else
+			BUG();
+		delete = true;
+		goto finish;
+	}
+	if (pampd_is_intransit(pampd)) {
+		/*
+		 * a pampd is marked intransit if it is remote and space has
+		 * been allocated for it locally (note, only happens for
+		 * persistent pages, in which case the remote copy is freed)
+		 */
+		BUG_ON(eph);
+		pampd = pampd_mask_intransit_and_remote(pampd);
+		zbud_copy_to_zbud(pampd, data, size);
+	} else {
+		/*
+		 * setting pampd to NULL tells tmem_localify_finish to leave
+		 * pampd alone... meaning it is left pointing to the
+		 * remote copy
+		 */
+		pampd = NULL;
+		obj = NULL;
+	}
+	/*
+	 * but in all cases, we decompress direct-to-memory to complete
+	 * the remotify and return success
+	 */
+	BUG_ON(extra == NULL);
+	zcache_decompress_to_page(data, size, (struct page *)extra);
+	if (eph)
+		ramster_remote_eph_pages_succ_get++;
+	else
+		ramster_remote_pers_pages_succ_get++;
+	ret = 0;
+finish:
+	tmem_localify_finish(obj, index, pampd, saved_hb, delete);
+	zcache_put_pool(pool);
+	local_irq_restore(flags);
+out:
+	return ret;
+}
+
+void ramster_pampd_new_obj(struct tmem_obj *obj)
+{
+	obj->extra = NULL;
+}
+
+void ramster_pampd_free_obj(struct tmem_pool *pool, struct tmem_obj *obj,
+				bool pool_destroy)
+{
+	struct flushlist_node *flnode;
+
+	BUG_ON(preemptible());
+	if (obj->extra == NULL)
+		return;
+	if (pool_destroy && is_ephemeral(pool))
+		/* FIXME don't bother with remote eph data for now */
+		return;
+	BUG_ON(!pampd_is_remote(obj->extra));
+	flnode = ramster_flnode_alloc(pool);
+	flnode->xh.client_id = pampd_remote_node(obj->extra);
+	flnode->xh.pool_id = pool->pool_id;
+	flnode->xh.oid = obj->oid;
+	flnode->xh.index = FLUSH_ENTIRE_OBJECT;
+	flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_OBJ;
+	spin_lock(&ramster_rem_op_list_lock);
+	list_add(&flnode->rem_op.list, &ramster_rem_op_list);
+	spin_unlock(&ramster_rem_op_list_lock);
+}
+
+/*
+ * Called on a remote persistent tmem_get to attempt to preallocate
+ * local storage for the data contained in the remote persistent page.
+ * If successfully preallocated, returns the pampd, marked as remote and
+ * in_transit.  Else returns NULL.  Note that the appropriate tmem data
+ * structure must be locked.
+ */
+void *ramster_pampd_repatriate_preload(void *pampd, struct tmem_pool *pool,
+					struct tmem_oid *oidp, uint32_t index,
+					bool *intransit)
+{
+	int clen = pampd_remote_size(pampd), c;
+	void *ret_pampd = NULL;
+	unsigned long flags;
+	struct tmem_handle th;
+
+	BUG_ON(!pampd_is_remote(pampd));
+	BUG_ON(is_ephemeral(pool));
+	if (use_frontswap_exclusive_gets)
+		/* don't need local storage */
+		goto out;
+	if (pampd_is_intransit(pampd)) {
+		/*
+		 * to avoid multiple allocations (and maybe a memory leak)
+		 * don't preallocate if already in the process of being
+		 * repatriated
+		 */
+		*intransit = true;
+		goto out;
+	}
+	*intransit = false;
+	local_irq_save(flags);
+	th.client_id = pampd_remote_node(pampd);
+	th.pool_id = pool->pool_id;
+	th.oid = *oidp;
+	th.index = index;
+	ret_pampd = zcache_pampd_create(NULL, clen, true, false, &th);
+	if (ret_pampd != NULL) {
+		/*
+		 * a pampd is marked intransit if it is remote and space has
+		 * been allocated for it locally (note, only happens for
+		 * persistent pages, in which case the remote copy is freed)
+		 */
+		ret_pampd = pampd_mark_intransit(ret_pampd);
+		c = atomic_dec_return(&ramster_remote_pers_pages);
+		WARN_ON_ONCE(c < 0);
+	} else {
+		ramster_pers_pages_remote_nomem++;
+	}
+	local_irq_restore(flags);
+out:
+	return ret_pampd;
+}
+
+/*
+ * Called on a remote tmem_get to invoke a message to fetch the page.
+ * Might sleep so no tmem locks can be held.  "extra" is passed
+ * all the way through the round-trip messaging to ramster_localify.
+ */
+int ramster_pampd_repatriate(void *fake_pampd, void *real_pampd,
+				struct tmem_pool *pool,
+				struct tmem_oid *oid, uint32_t index,
+				bool free, void *extra)
+{
+	struct tmem_xhandle xh;
+	int ret;
+
+	if (pampd_is_intransit(real_pampd))
+		/* have local space pre-reserved, so free remote copy */
+		free = true;
+	xh = tmem_xhandle_fill(LOCAL_CLIENT, pool, oid, index);
+	/* unreliable request/response for now */
+	ret = r2net_remote_async_get(&xh, free,
+					pampd_remote_node(fake_pampd),
+					pampd_remote_size(fake_pampd),
+					pampd_remote_cksum(fake_pampd),
+					extra);
+	return ret;
+}
+
+bool ramster_pampd_is_remote(void *pampd)
+{
+	return pampd_is_remote(pampd);
+}
+
+int ramster_pampd_replace_in_obj(void *new_pampd, struct tmem_obj *obj)
+{
+	int ret = -1;
+
+	if (new_pampd != NULL) {
+		if (obj->extra == NULL)
+			obj->extra = new_pampd;
+		/* enforce that all remote pages in an object reside
+		 * in the same node! */
+		else if (pampd_remote_node(new_pampd) !=
+				pampd_remote_node((void *)(obj->extra)))
+			BUG();
+		ret = 0;
+	}
+	return ret;
+}
+
+void *ramster_pampd_free(void *pampd, struct tmem_pool *pool,
+			      struct tmem_oid *oid, uint32_t index, bool acct)
+{
+	bool eph = is_ephemeral(pool);
+	void *local_pampd = NULL;
+	int c;
+
+	BUG_ON(preemptible());
+	BUG_ON(!pampd_is_remote(pampd));
+	WARN_ON(acct == false);
+	if (oid == NULL) {
+		/*
+		 * a NULL oid means to ignore this pampd free
+		 * as the remote freeing will be handled elsewhere
+		 */
+	} else if (eph) {
+		/* FIXME remote flush optional but probably good idea */
+	} else if (pampd_is_intransit(pampd)) {
+		/* did a pers remote get_and_free, so just free local */
+		local_pampd = pampd_mask_intransit_and_remote(pampd);
+	} else {
+		struct flushlist_node *flnode =
+			ramster_flnode_alloc(pool);
+
+		flnode->xh.client_id = pampd_remote_node(pampd);
+		flnode->xh.pool_id = pool->pool_id;
+		flnode->xh.oid = *oid;
+		flnode->xh.index = index;
+		flnode->rem_op.op = RAMSTER_REMOTIFY_FLUSH_PAGE;
+		spin_lock(&ramster_rem_op_list_lock);
+		list_add(&flnode->rem_op.list, &ramster_rem_op_list);
+		spin_unlock(&ramster_rem_op_list_lock);
+		c = atomic_dec_return(&ramster_remote_pers_pages);
+		WARN_ON_ONCE(c < 0);
+	}
+	return local_pampd;
+}
+
+void ramster_count_foreign_pages(bool eph, int count)
+{
+	int c;
+
+	BUG_ON(count != 1 && count != -1);
+	if (eph) {
+		if (count > 0) {
+			c = atomic_inc_return(
+					&ramster_foreign_eph_pages_atomic);
+			if (c > ramster_foreign_eph_pages_max)
+				ramster_foreign_eph_pages_max = c;
+		} else {
+			c = atomic_dec_return(
+					&ramster_foreign_eph_pages_atomic);
+			WARN_ON_ONCE(c < 0);
+		}
+		ramster_foreign_eph_pages = c;
+	} else {
+		if (count > 0) {
+			c = atomic_inc_return(
+					&ramster_foreign_pers_pages_atomic);
+			if (c > ramster_foreign_pers_pages_max)
+				ramster_foreign_pers_pages_max = c;
+		} else {
+			c = atomic_dec_return(
+					&ramster_foreign_pers_pages_atomic);
+			WARN_ON_ONCE(c < 0);
+		}
+		ramster_foreign_pers_pages = c;
+	}
+}
+
+/*
+ * For now, just push over a few pages every few seconds to
+ * ensure that it basically works
+ */
+static struct workqueue_struct *ramster_remotify_workqueue;
+static void ramster_remotify_process(struct work_struct *work);
+static DECLARE_DELAYED_WORK(ramster_remotify_worker,
+		ramster_remotify_process);
+
+static void ramster_remotify_queue_delayed_work(unsigned long delay)
+{
+	if (!queue_delayed_work(ramster_remotify_workqueue,
+				&ramster_remotify_worker, delay))
+		pr_err("ramster_remotify: bad workqueue\n");
+}
+
+static void ramster_remote_flush_page(struct flushlist_node *flnode)
+{
+	struct tmem_xhandle *xh;
+	int remotenode, ret;
+
+	preempt_disable();
+	xh = &flnode->xh;
+	remotenode = flnode->xh.client_id;
+	ret = r2net_remote_flush(xh, remotenode);
+	if (ret >= 0)
+		ramster_remote_pages_flushed++;
+	else
+		ramster_remote_page_flushes_failed++;
+	preempt_enable_no_resched();
+	ramster_flnode_free(flnode, NULL);
+}
+
+static void ramster_remote_flush_object(struct flushlist_node *flnode)
+{
+	struct tmem_xhandle *xh;
+	int remotenode, ret;
+
+	preempt_disable();
+	xh = &flnode->xh;
+	remotenode = flnode->xh.client_id;
+	ret = r2net_remote_flush_object(xh, remotenode);
+	if (ret >= 0)
+		ramster_remote_objects_flushed++;
+	else
+		ramster_remote_object_flushes_failed++;
+	preempt_enable_no_resched();
+	ramster_flnode_free(flnode, NULL);
+}
+
+int ramster_remotify_pageframe(bool eph)
+{
+	struct tmem_xhandle xh;
+	unsigned int size;
+	int remotenode, ret, zbuds;
+	struct tmem_pool *pool;
+	unsigned long flags;
+	unsigned char cksum;
+	char *p;
+	int i, j;
+	unsigned char *tmpmem[2];
+	struct tmem_handle th[2];
+	unsigned int zsize[2];
+
+	tmpmem[0] = __get_cpu_var(ramster_remoteputmem1);
+	tmpmem[1] = __get_cpu_var(ramster_remoteputmem2);
+	local_bh_disable();
+	zbuds = zbud_make_zombie_lru(&th[0], &tmpmem[0], &zsize[0], eph);
+	/* now OK to release lock set in caller */
+	local_bh_enable();
+	if (zbuds == 0)
+		goto out;
+	BUG_ON(zbuds > 2);
+	for (i = 0; i < zbuds; i++) {
+		xh.client_id = th[i].client_id;
+		xh.pool_id = th[i].pool_id;
+		xh.oid = th[i].oid;
+		xh.index = th[i].index;
+		size = zsize[i];
+		BUG_ON(size == 0 || size > zbud_max_buddy_size());
+		for (p = tmpmem[i], cksum = 0, j = 0; j < size; j++)
+			cksum += *p++;
+		ret = r2net_remote_put(&xh, tmpmem[i], size, eph, &remotenode);
+		if (ret != 0) {
+			/*
+			 * This is some form of a memory leak... if the remote
+			 * put fails, there will never be another attempt to
+			 * remotify this page.  But since we've dropped the zv
+			 * pointer, the page may have been freed or the data
+			 * replaced so we can't just "put it back" in the
+			 * remote op list.  Even if we could, not sure where
+			 * to put it in the list because there may be flushes
+			 * that must be strictly ordered vs the put.  So leave
+			 * this as a FIXME for now.  But count them so we know
+			 * if it becomes a problem.
+			 */
+			if (eph)
+				ramster_eph_pages_remote_failed++;
+			else
+				ramster_pers_pages_remote_failed++;
+			break;
+		} else {
+			if (!eph)
+				atomic_inc(&ramster_remote_pers_pages);
+		}
+		if (eph)
+			ramster_eph_pages_remoted++;
+		else
+			ramster_pers_pages_remoted++;
+		/*
+		 * data was successfully remoted so change the local version
+		 * to point to the remote node where it landed
+		 */
+		local_bh_disable();
+		pool = zcache_get_pool_by_id(LOCAL_CLIENT, xh.pool_id);
+		local_irq_save(flags);
+		(void)tmem_replace(pool, &xh.oid, xh.index,
+				pampd_make_remote(remotenode, size, cksum));
+		local_irq_restore(flags);
+		zcache_put_pool(pool);
+		local_bh_enable();
+	}
+out:
+	return zbuds;
+}
+
+static void zcache_do_remotify_flushes(void)
+{
+	struct ramster_remotify_hdr *rem_op;
+	union remotify_list_node *u;
+
+	while (1) {
+		spin_lock(&ramster_rem_op_list_lock);
+		if (list_empty(&ramster_rem_op_list)) {
+			spin_unlock(&ramster_rem_op_list_lock);
+			goto out;
+		}
+		rem_op = list_first_entry(&ramster_rem_op_list,
+				struct ramster_remotify_hdr, list);
+		list_del_init(&rem_op->list);
+		spin_unlock(&ramster_rem_op_list_lock);
+		u = (union remotify_list_node *)rem_op;
+		switch (rem_op->op) {
+		case RAMSTER_REMOTIFY_FLUSH_PAGE:
+			ramster_remote_flush_page((struct flushlist_node *)u);
+			break;
+		case RAMSTER_REMOTIFY_FLUSH_OBJ:
+			ramster_remote_flush_object((struct flushlist_node *)u);
+			break;
+		default:
+			BUG();
+		}
+	}
+out:
+	return;
+}
+
+static void ramster_remotify_process(struct work_struct *work)
+{
+	static bool remotify_in_progress;
+	int i;
+
+	BUG_ON(irqs_disabled());
+	if (remotify_in_progress)
+		goto requeue;
+	if (ramster_remote_target_nodenum == -1)
+		goto requeue;
+	remotify_in_progress = true;
+	if (use_cleancache && ramster_eph_remotify_enable) {
+		for (i = 0; i < 100; i++) {
+			zcache_do_remotify_flushes();
+			(void)ramster_remotify_pageframe(true);
+		}
+	}
+	if (use_frontswap && ramster_pers_remotify_enable) {
+		for (i = 0; i < 100; i++) {
+			zcache_do_remotify_flushes();
+			(void)ramster_remotify_pageframe(false);
+		}
+	}
+	remotify_in_progress = false;
+requeue:
+	ramster_remotify_queue_delayed_work(HZ);
+}
+
+void __init ramster_remotify_init(void)
+{
+	unsigned long n = 60UL;
+	ramster_remotify_workqueue =
+		create_singlethread_workqueue("ramster_remotify");
+	ramster_remotify_queue_delayed_work(n * HZ);
+}
+
+static ssize_t ramster_manual_node_up_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	char *p = buf;
+
+	for (i = 0; i < MANUAL_NODES; i++)
+		if (ramster_nodes_manual_up[i])
+			p += sprintf(p, "%d ", i);
+	p += sprintf(p, "\n");
+	return p - buf;
+}
+
+static ssize_t ramster_manual_node_up_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int err;
+	unsigned long node_num;
+
+	err = kstrtoul(buf, 10, &node_num);
+	if (err) {
+		pr_err("ramster: bad strtoul?\n");
+		return -EINVAL;
+	}
+	if (node_num >= MANUAL_NODES) {
+		pr_err("ramster: bad node_num=%lu?\n", node_num);
+		return -EINVAL;
+	}
+	if (ramster_nodes_manual_up[node_num]) {
+		pr_err("ramster: node %d already up, ignoring\n",
+							(int)node_num);
+	} else {
+		ramster_nodes_manual_up[node_num] = true;
+		r2net_hb_node_up_manual((int)node_num);
+	}
+	return count;
+}
+
+static struct kobj_attribute ramster_manual_node_up_attr = {
+	.attr = { .name = "manual_node_up", .mode = 0644 },
+	.show = ramster_manual_node_up_show,
+	.store = ramster_manual_node_up_store,
+};
+
+static ssize_t ramster_remote_target_nodenum_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *buf)
+{
+	if (ramster_remote_target_nodenum == -1)
+		return sprintf(buf, "unset\n");
+	else
+		return sprintf(buf, "%d\n", ramster_remote_target_nodenum);
+}
+
+static ssize_t ramster_remote_target_nodenum_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int err;
+	unsigned long node_num;
+
+	err = kstrtoul(buf, 10, &node_num);
+	if (err) {
+		pr_err("ramster: bad strtoul?\n");
+		return -EINVAL;
+	} else if (node_num == -1UL) {
+		pr_err("ramster: disabling all remotification, data may still reside on remote nodes however\n");
+		return -EINVAL;
+	} else if (node_num >= MANUAL_NODES) {
+		pr_err("ramster: bad node_num=%lu?\n", node_num);
+		return -EINVAL;
+	} else if (!ramster_nodes_manual_up[node_num]) {
+		pr_err("ramster: node %d not up, ignoring setting of remotification target\n",
+			(int)node_num);
+	} else if (r2net_remote_target_node_set((int)node_num) >= 0) {
+		pr_info("ramster: node %d set as remotification target\n",
+			(int)node_num);
+		ramster_remote_target_nodenum = (int)node_num;
+	} else {
+		pr_err("ramster: bad num to node node_num=%d?\n",
+			(int)node_num);
+		return -EINVAL;
+	}
+	return count;
+}
+
+static struct kobj_attribute ramster_remote_target_nodenum_attr = {
+	.attr = { .name = "remote_target_nodenum", .mode = 0644 },
+	.show = ramster_remote_target_nodenum_show,
+	.store = ramster_remote_target_nodenum_store,
+};
+
+#define RAMSTER_SYSFS_RO(_name) \
+	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
+				struct kobj_attribute *attr, char *buf) \
+	{ \
+		return sprintf(buf, "%lu\n", ramster_##_name); \
+	} \
+	static struct kobj_attribute ramster_##_name##_attr = { \
+		.attr = { .name = __stringify(_name), .mode = 0444 }, \
+		.show = ramster_##_name##_show, \
+	}
+
+#define RAMSTER_SYSFS_RW(_name) \
+	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
+				struct kobj_attribute *attr, char *buf) \
+	{ \
+		return sprintf(buf, "%lu\n", ramster_##_name); \
+	} \
+	static ssize_t ramster_##_name##_store(struct kobject *kobj, \
+		struct kobj_attribute *attr, const char *buf, size_t count) \
+	{ \
+		int err; \
+		unsigned long enable; \
+		err = kstrtoul(buf, 10, &enable); \
+		if (err) \
+			return -EINVAL; \
+		ramster_##_name = enable; \
+		return count; \
+	} \
+	static struct kobj_attribute ramster_##_name##_attr = { \
+		.attr = { .name = __stringify(_name), .mode = 0644 }, \
+		.show = ramster_##_name##_show, \
+		.store = ramster_##_name##_store, \
+	}
+
+#define RAMSTER_SYSFS_RO_ATOMIC(_name) \
+	static ssize_t ramster_##_name##_show(struct kobject *kobj, \
+				struct kobj_attribute *attr, char *buf) \
+	{ \
+		return sprintf(buf, "%d\n", atomic_read(&ramster_##_name)); \
+	} \
+	static struct kobj_attribute ramster_##_name##_attr = { \
+		.attr = { .name = __stringify(_name), .mode = 0444 }, \
+		.show = ramster_##_name##_show, \
+	}
+
+RAMSTER_SYSFS_RO(interface_revision);
+RAMSTER_SYSFS_RO_ATOMIC(remote_pers_pages);
+RAMSTER_SYSFS_RW(pers_remotify_enable);
+RAMSTER_SYSFS_RW(eph_remotify_enable);
+
+static struct attribute *ramster_attrs[] = {
+	&ramster_interface_revision_attr.attr,
+	&ramster_remote_pers_pages_attr.attr,
+	&ramster_manual_node_up_attr.attr,
+	&ramster_remote_target_nodenum_attr.attr,
+	&ramster_pers_remotify_enable_attr.attr,
+	&ramster_eph_remotify_enable_attr.attr,
+	NULL,
+};
+
+static struct attribute_group ramster_attr_group = {
+	.attrs = ramster_attrs,
+	.name = "ramster",
+};
+
+/*
+ * frontswap selfshrinking
+ */
+
+/* In HZ, controls frequency of worker invocation. */
+static unsigned int selfshrink_interval __read_mostly = 5;
+/* Enable/disable with sysfs. */
+static bool frontswap_selfshrinking __read_mostly;
+
+static void selfshrink_process(struct work_struct *work);
+static DECLARE_DELAYED_WORK(selfshrink_worker, selfshrink_process);
+
+/* Enable/disable with kernel boot option. */
+static bool use_frontswap_selfshrink __initdata = true;
+
+/*
+ * The default values for the following parameters were deemed reasonable
+ * by experimentation, may be workload-dependent, and can all be
+ * adjusted via sysfs.
+ */
+
+/* Control rate for frontswap shrinking. Higher hysteresis is slower. */
+static unsigned int frontswap_hysteresis __read_mostly = 20;
+
+/*
+ * Number of selfshrink worker invocations to wait before observing that
+ * frontswap selfshrinking should commence.  Note that selfshrinking does
+ * not use a separate worker thread.
+ */
+static unsigned int frontswap_inertia __read_mostly = 3;
+
+/* Countdown to next invocation of frontswap_shrink() */
+static unsigned long frontswap_inertia_counter;
+
+/*
+ * Invoked by the selfshrink worker thread, uses current number of pages
+ * in frontswap (frontswap_curr_pages()), previous status, and control
+ * values (hysteresis and inertia) to determine if frontswap should be
+ * shrunk and what the new frontswap size should be.  Note that
+ * frontswap_shrink is essentially a partial swapoff that immediately
+ * transfers pages from the "swap device" (frontswap) back into kernel
+ * RAM; despite the name, frontswap "shrinking" is very different from
+ * the "shrinker" interface used by the kernel MM subsystem to reclaim
+ * memory.
+ */
+static void frontswap_selfshrink(void)
+{
+	static unsigned long cur_frontswap_pages;
+	static unsigned long last_frontswap_pages;
+	static unsigned long tgt_frontswap_pages;
+
+	last_frontswap_pages = cur_frontswap_pages;
+	cur_frontswap_pages = frontswap_curr_pages();
+	if (!cur_frontswap_pages ||
+			(cur_frontswap_pages > last_frontswap_pages)) {
+		frontswap_inertia_counter = frontswap_inertia;
+		return;
+	}
+	if (frontswap_inertia_counter && --frontswap_inertia_counter)
+		return;
+	if (cur_frontswap_pages <= frontswap_hysteresis)
+		tgt_frontswap_pages = 0;
+	else
+		tgt_frontswap_pages = cur_frontswap_pages -
+			(cur_frontswap_pages / frontswap_hysteresis);
+	frontswap_shrink(tgt_frontswap_pages);
+}
+
+static int __init ramster_nofrontswap_selfshrink_setup(char *s)
+{
+	use_frontswap_selfshrink = false;
+	return 1;
+}
+
+__setup("noselfshrink", ramster_nofrontswap_selfshrink_setup);
+
+static void selfshrink_process(struct work_struct *work)
+{
+	if (frontswap_selfshrinking && frontswap_enabled) {
+		frontswap_selfshrink();
+		schedule_delayed_work(&selfshrink_worker,
+			selfshrink_interval * HZ);
+	}
+}
+
+void ramster_cpu_up(int cpu)
+{
+	unsigned char *p1 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
+	unsigned char *p2 = kzalloc(PAGE_SIZE, GFP_KERNEL | __GFP_REPEAT);
+
+	BUG_ON(!p1 || !p2);
+	per_cpu(ramster_remoteputmem1, cpu) = p1;
+	per_cpu(ramster_remoteputmem2, cpu) = p2;
+}
+
+void ramster_cpu_down(int cpu)
+{
+	struct ramster_preload *kp;
+
+	kfree(per_cpu(ramster_remoteputmem1, cpu));
+	per_cpu(ramster_remoteputmem1, cpu) = NULL;
+	kfree(per_cpu(ramster_remoteputmem2, cpu));
+	per_cpu(ramster_remoteputmem2, cpu) = NULL;
+	kp = &per_cpu(ramster_preloads, cpu);
+	if (kp->flnode) {
+		kmem_cache_free(ramster_flnode_cache, kp->flnode);
+		kp->flnode = NULL;
+	}
+}
+
+void ramster_register_pamops(struct tmem_pamops *pamops)
+{
+	pamops->free_obj = ramster_pampd_free_obj;
+	pamops->new_obj = ramster_pampd_new_obj;
+	pamops->replace_in_obj = ramster_pampd_replace_in_obj;
+	pamops->is_remote = ramster_pampd_is_remote;
+	pamops->repatriate = ramster_pampd_repatriate;
+	pamops->repatriate_preload = ramster_pampd_repatriate_preload;
+}
+
+void __init ramster_init(bool cleancache, bool frontswap,
+				bool frontswap_exclusive_gets)
+{
+	int ret = 0;
+
+	if (cleancache)
+		use_cleancache = true;
+	if (frontswap)
+		use_frontswap = true;
+	if (frontswap_exclusive_gets)
+		use_frontswap_exclusive_gets = true;
+	ramster_debugfs_init();
+	ret = sysfs_create_group(mm_kobj, &ramster_attr_group);
+	if (ret)
+		pr_err("ramster: can't create sysfs for ramster\n");
+	(void)r2net_register_handlers();
+	INIT_LIST_HEAD(&ramster_rem_op_list);
+	ramster_flnode_cache = kmem_cache_create("ramster_flnode",
+				sizeof(struct flushlist_node), 0, 0, NULL);
+	frontswap_selfshrinking = use_frontswap_selfshrink;
+	if (frontswap_selfshrinking) {
+		pr_info("ramster: Initializing frontswap selfshrink driver.\n");
+		schedule_delayed_work(&selfshrink_worker,
+					selfshrink_interval * HZ);
+	}
+	ramster_remotify_init();
+}
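
Usage note: the sysfs group above is created on mm_kobj with .name = "ramster", so the knobs land under /sys/kernel/mm/ramster/. Below is a minimal userspace sketch of the configuration sequence the store handlers expect: bring each node up manually, pick a remotification target, then enable remotification. The helper sysfs_write() and the node numbers are illustrative only, and cluster membership is assumed to have been configured beforehand with the r2net/nodemanager userland tools.

#include <stdio.h>
#include <stdlib.h>

/* write one value to an assumed /sys/kernel/mm/ramster/ knob */
static void sysfs_write(const char *knob, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/mm/ramster/%s", knob);
	f = fopen(path, "w");
	if (f == NULL) {
		perror(path);
		exit(EXIT_FAILURE);
	}
	fprintf(f, "%s\n", val);
	fclose(f);
}

int main(void)
{
	sysfs_write("manual_node_up", "0");        /* this node (example number) */
	sysfs_write("manual_node_up", "1");        /* the "memory server" (example) */
	sysfs_write("remote_target_nodenum", "1"); /* remotify to node 1 */
	sysfs_write("eph_remotify_enable", "1");   /* ephemeral (cleancache) pages */
	sysfs_write("pers_remotify_enable", "1");  /* persistent (frontswap) pages */
	return 0;
}

The ordering matters: ramster_remote_target_nodenum_store() refuses any node that has not first been marked up via manual_node_up, and the remotify worker does nothing until a target node is set and at least one of the remotify_enable knobs is nonzero.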