From d29adb34a94715174c88ca93e8aba955850c9bde Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Mon, 2 Dec 2013 19:11:48 -0800 Subject: libceph: block I/O when PAUSE or FULL osd map flags are set The PAUSEWR and PAUSERD flags are meant to stop the cluster from processing writes and reads, respectively. The FULL flag is set when the cluster determines that it is out of space, and will no longer process writes. PAUSEWR and PAUSERD are purely client-side settings already implemented in userspace clients. The osd does nothing special with these flags. When the FULL flag is set, however, the osd responds to all writes with -ENOSPC. For cephfs, this makes sense, but for rbd the block layer translates this into EIO. If a cluster goes from full to non-full quickly, a filesystem on top of rbd will not behave well, since some writes succeed while others get EIO. Fix this by blocking any writes when the FULL flag is set in the osd client. This is the same strategy used by userspace, so apply it by default. A follow-on patch makes this configurable. __map_request() is called to re-target osd requests in case the available osds changed. Add a paused field to a ceph_osd_request, and set it whenever an appropriate osd map flag is set. Avoid queueing paused requests in __map_request(), but force them to be resent if they become unpaused. Also subscribe to the next osd map from the monitor if any of these flags are set, so paused requests can be unblocked as soon as possible. Fixes: http://tracker.ceph.com/issues/6079 Reviewed-by: Sage Weil Signed-off-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 8f47625a0661..4fb6a8938957 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -138,6 +138,7 @@ struct ceph_osd_request { __le64 *r_request_pool; void *r_request_pgid; __le32 *r_request_attempts; + bool r_paused; struct ceph_eversion *r_request_reassert_version; int r_result; -- cgit v1.2.3 From 12b4629a9fb80fecaebadc217b13b8776ed8dbef Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:23 +0200 Subject: libceph: all features fields must be u64 In preparation for ceph_features.h update, change all features fields from unsigned int/u32 to u64. (ceph.git has ~40 feature bits at this point.) 
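For illustration only (a standalone userspace sketch, not part of the patch): with a 32-bit field, any feature bit at position 32 or above is silently truncated, so both ends of a feature negotiation would mis-detect support.

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* bit 32 (e.g. OSD_SNAPMAPPER) does not fit in 32 bits */
		uint32_t narrow = (uint32_t)(1ULL << 32);	/* truncates to 0 */
		uint64_t wide = 1ULL << 32;			/* preserved */

		printf("u32: %#x  u64: %#llx\n", narrow,
		       (unsigned long long)wide);
		return 0;
	}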
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 14 +++++++------- fs/ceph/super.c | 4 ++-- include/linux/ceph/libceph.h | 8 ++++---- include/linux/ceph/messenger.h | 10 +++++----- net/ceph/ceph_common.c | 4 ++-- net/ceph/messenger.c | 4 ++-- 6 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d90861f45210..4a13f6e72069 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -63,7 +63,7 @@ static const struct ceph_connection_operations mds_con_ops; */ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, - int features) + u64 features) { int err = -EIO; @@ -98,7 +98,7 @@ bad: */ static int parse_reply_info_trace(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { int err; @@ -145,7 +145,7 @@ out_bad: */ static int parse_reply_info_dir(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { u32 num, i = 0; int err; @@ -217,7 +217,7 @@ out_bad: */ static int parse_reply_info_filelock(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; @@ -238,7 +238,7 @@ bad: */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { if (*p == end) { @@ -262,7 +262,7 @@ bad: */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); @@ -280,7 +280,7 @@ static int parse_reply_info_extra(void **p, void *end, */ static int parse_reply_info(struct ceph_msg *msg, struct ceph_mds_reply_info_parsed *info, - int features) + u64 features) { void *p, *end; u32 len; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c6740e43a351..2df963f1cf5a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -490,10 +490,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const unsigned supported_features = + const u64 supported_features = CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH; - const unsigned required_features = 0; + const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 2e3024881a5e..7d704db60cbb 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -122,8 +122,8 @@ struct ceph_client { int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *); - u32 supported_features; - u32 required_features; + u64 supported_features; + u64 required_features; struct ceph_messenger msgr; /* messenger instance */ struct ceph_mon_client monc; @@ -192,8 +192,8 @@ extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); extern struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - unsigned supported_features, - unsigned required_features); + u64 supported_features, + u64 required_features); extern u64 ceph_client_id(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); extern int __ceph_open_session(struct ceph_client *client, diff --git a/include/linux/ceph/messenger.h 
b/include/linux/ceph/messenger.h index 7c1420bb1dce..c1d3f5a65273 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -60,8 +60,8 @@ struct ceph_messenger { u32 global_seq; spinlock_t global_seq_lock; - u32 supported_features; - u32 required_features; + u64 supported_features; + u64 required_features; }; enum ceph_msg_data_type { @@ -192,7 +192,7 @@ struct ceph_connection { struct ceph_entity_name peer_name; /* peer name */ - unsigned peer_features; + u64 peer_features; u32 connect_seq; /* identify the most recent connection attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ @@ -256,8 +256,8 @@ extern void ceph_msgr_flush(void); extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc); extern void ceph_con_init(struct ceph_connection *con, void *private, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 34b11ee8124e..43d8177a52e1 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -461,8 +461,8 @@ EXPORT_SYMBOL(ceph_client_id); * create a fresh client instance */ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - unsigned int supported_features, - unsigned int required_features) + u64 supported_features, + u64 required_features) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4a5df7b1cc9f..260670558746 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2853,8 +2853,8 @@ static void con_fault(struct ceph_connection *con) */ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, - u32 supported_features, - u32 required_features, + u64 supported_features, + u64 required_features, bool nocrc) { msgr->supported_features = supported_features; -- cgit v1.2.3 From 2b3e0c905af43cfe402a2ef3f800be5dc1684005 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: libceph: update ceph_features.h This updates ceph_features.h so that it has all feature bits defined in ceph.git. In the interim since the last update, ceph.git crossed the "32 feature bits" point, and the addition of the 33rd bit wasn't handled correctly. The work-around is squashed into this commit and reflects ceph.git commit 053659d05e0349053ef703b414f44965f368b9f0.
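The work-around can be exercised in isolation. A minimal userspace sketch of the helper's behaviour (mirroring the ceph_sanitize_features() added below; constants copied for illustration, not authoritative):

	#include <stdio.h>
	#include <stdint.h>

	#define FEATURE_RESERVED (1ULL << 63)

	static uint64_t sanitize_features(uint64_t features)
	{
		if (features & FEATURE_RESERVED)
			return 0x1ffffffffull; /* through OSD_SNAPMAPPER */
		return features;
	}

	int main(void)
	{
		/* a peer still using the broken "33 bit ~0" encoding */
		printf("%#llx\n", (unsigned long long)
		       sanitize_features(FEATURE_RESERVED));
		/* a sane feature vector passes through unchanged */
		printf("%#llx\n", (unsigned long long)
		       sanitize_features(0xffULL));
		return 0;
	}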
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 96 +++++++++++++++++++++++++------------- net/ceph/messenger.c | 4 +- 2 files changed, 67 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 4c42080347af..003ea719be65 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -4,42 +4,73 @@ /* * feature bits */ -#define CEPH_FEATURE_UID (1<<0) -#define CEPH_FEATURE_NOSRCADDR (1<<1) -#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) -#define CEPH_FEATURE_FLOCK (1<<3) -#define CEPH_FEATURE_SUBSCRIBE2 (1<<4) -#define CEPH_FEATURE_MONNAMES (1<<5) -#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) -#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) -#define CEPH_FEATURE_OBJECTLOCATOR (1<<8) -#define CEPH_FEATURE_PGID64 (1<<9) -#define CEPH_FEATURE_INCSUBOSDMAP (1<<10) -#define CEPH_FEATURE_PGPOOL3 (1<<11) -#define CEPH_FEATURE_OSDREPLYMUX (1<<12) -#define CEPH_FEATURE_OSDENC (1<<13) -#define CEPH_FEATURE_OMAP (1<<14) -#define CEPH_FEATURE_MONENC (1<<15) -#define CEPH_FEATURE_QUERY_T (1<<16) -#define CEPH_FEATURE_INDEP_PG_MAP (1<<17) -#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) -#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19) -#define CEPH_FEATURE_MON_NULLROUTE (1<<20) -#define CEPH_FEATURE_MON_GV (1<<21) -#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22) -#define CEPH_FEATURE_MSG_AUTH (1<<23) -#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24) -#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) -#define CEPH_FEATURE_CREATEPOOLID (1<<26) -#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) -#define CEPH_FEATURE_OSD_HBMSGS (1<<28) -#define CEPH_FEATURE_MDSENC (1<<29) -#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30) +#define CEPH_FEATURE_UID (1ULL<<0) +#define CEPH_FEATURE_NOSRCADDR (1ULL<<1) +#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2) +#define CEPH_FEATURE_FLOCK (1ULL<<3) +#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4) +#define CEPH_FEATURE_MONNAMES (1ULL<<5) +#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6) +#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7) +#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8) +#define CEPH_FEATURE_PGID64 (1ULL<<9) +#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10) +#define CEPH_FEATURE_PGPOOL3 (1ULL<<11) +#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12) +#define CEPH_FEATURE_OSDENC (1ULL<<13) +#define CEPH_FEATURE_OMAP (1ULL<<14) +#define CEPH_FEATURE_MONENC (1ULL<<15) +#define CEPH_FEATURE_QUERY_T (1ULL<<16) +#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17) +#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18) +#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19) +#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20) +#define CEPH_FEATURE_MON_GV (1ULL<<21) +#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22) +#define CEPH_FEATURE_MSG_AUTH (1ULL<<23) +#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24) +#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25) +#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26) +#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27) +#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28) +#define CEPH_FEATURE_MDSENC (1ULL<<29) +#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30) +#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31) +#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32) +#define CEPH_FEATURE_MON_SCRUB (1ULL<<33) +#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34) +#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35) +#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ +#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) +#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) + +/* + * The 
introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature + * vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63 + * to mean 33 bit ~0, and introduce a helper below to do the + * translation. + * + * This was introduced by ceph.git commit + * 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8 + * and fixed by ceph.git commit + * 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c + */ +#define CEPH_FEATURE_RESERVED (1ULL<<63) + +static inline u64 ceph_sanitize_features(u64 features) +{ + if (features & CEPH_FEATURE_RESERVED) { + /* everything through OSD_SNAPMAPPER */ + return 0x1ffffffffull; + } else { + return features; + } +} /* * Features supported. */ -#define CEPH_FEATURES_SUPPORTED_DEFAULT \ +#define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ @@ -56,4 +87,5 @@ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC) + #endif diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 260670558746..bd172e1ee0ae 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -1945,7 +1946,8 @@ static int process_connect(struct ceph_connection *con) { u64 sup_feat = con->msgr->supported_features; u64 req_feat = con->msgr->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); + u64 server_feat = ceph_sanitize_features( + le64_to_cpu(con->in_reply.features)); int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); -- cgit v1.2.3 From b3b33b0e43323af4fb697f4378218d3c268d02cd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: crush: pass weight vector size to map function Pass the size of the weight vector into crush_do_rule() to ensure that we don't access values past the end. This can happen if the caller misbehaves and passes a weight vector that is smaller than max_devices. Currently the monitor tries to prevent that from happening, but this will gracefully tolerate previous bad osdmaps that got into this state. It's also a bit more defensive. Reflects ceph.git commit 5922e2c2b8335b5e46c9504349c3a55b7434c01a. 
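The essence of the check is small; a standalone sketch (simplified from the is_out() change below - it omits the probabilistic test the real function performs on partial weights):

	#include <stdio.h>
	#include <stdint.h>

	static int is_out_sketch(const uint32_t *weight, int weight_max,
				 int item)
	{
		if (item >= weight_max)
			return 1;	/* no weight known - treat as out */
		return weight[item] == 0;	/* zero weight: fully out */
	}

	int main(void)
	{
		uint32_t w[] = { 0x10000, 0 };	/* in, out */

		/* item 5 is past the end of the vector: reported out
		 * instead of reading past the end of the array */
		printf("%d %d %d\n", is_out_sketch(w, 2, 0),
		       is_out_sketch(w, 2, 1), is_out_sketch(w, 2, 5));
		return 0;
	}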
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/mapper.h | 2 +- net/ceph/crush/mapper.c | 17 ++++++++++++----- net/ceph/osdmap.c | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index 5772dee3ecbf..69310b031875 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h @@ -14,6 +14,6 @@ extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, i extern int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - const __u32 *weights); + const __u32 *weights, int weight_max); #endif diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index cbd06a91941c..18d2cf66f102 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -264,8 +264,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r) * true if device is marked "out" (failed, fully offloaded) * of the cluster */ -static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x) +static int is_out(const struct crush_map *map, + const __u32 *weight, int weight_max, + int item, int x) { + if (item >= weight_max) + return 1; if (weight[item] >= 0x10000) return 0; if (weight[item] == 0) @@ -292,7 +296,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in */ static int crush_choose(const struct crush_map *map, struct crush_bucket *bucket, - const __u32 *weight, + const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, int firstn, int recurse_to_leaf, @@ -396,7 +400,7 @@ static int crush_choose(const struct crush_map *map, if (item < 0) { if (crush_choose(map, map->buckets[-1-item], - weight, + weight, weight_max, x, outpos+1, 0, out2, outpos, firstn, 0, @@ -414,6 +418,7 @@ static int crush_choose(const struct crush_map *map, /* out? */ if (itemtype == 0) reject = is_out(map, weight, + weight_max, item, x); else reject = 0; @@ -470,10 +475,12 @@ reject: * @x: hash input * @result: pointer to result vector * @result_max: maximum result size + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - const __u32 *weight) + const __u32 *weight, int weight_max) { int result_len; int a[CRUSH_MAX_SET]; @@ -545,7 +552,7 @@ int crush_do_rule(const struct crush_map *map, j = 0; osize += crush_choose(map, map->buckets[-1-w[i]], - weight, + weight, weight_max, x, numrep, curstep->arg2, o+osize, j, diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index dbd9a4792427..6477a68ddecb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1165,7 +1165,7 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, } r = crush_do_rule(osdmap->crush, ruleno, pps, osds, min_t(int, pool->size, *num), - osdmap->osd_weight); + osdmap->osd_weight, osdmap->max_osd); if (r < 0) { pr_err("error %d from crush rule: pool %lld ruleset %d type %d" " size %d\n", r, pgid.pool, pool->crush_ruleset, -- cgit v1.2.3 From bfb16d7d69f0272451ad85a6e50aab3c4262fbc0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: crush: factor out (trivial) crush_destroy_rule() Reflects ceph.git commit 43a01c9973c4b83f2eaa98be87429941a227ddde. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 1 + net/ceph/crush/crush.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 6a1101f24cfb..09561a04c127 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -174,6 +174,7 @@ extern void crush_destroy_bucket_list(struct crush_bucket_list *b); extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); extern void crush_destroy_bucket(struct crush_bucket *b); +extern void crush_destroy_rule(struct crush_rule *r); extern void crush_destroy(struct crush_map *map); static inline int crush_calc_tree_node(int i) diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 089613234f03..16bc199d9a62 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c @@ -116,11 +116,14 @@ void crush_destroy(struct crush_map *map) if (map->rules) { __u32 b; for (b = 0; b < map->max_rules; b++) - kfree(map->rules[b]); + crush_destroy_rule(map->rules[b]); kfree(map->rules); } kfree(map); } - +void crush_destroy_rule(struct crush_rule *rule) +{ + kfree(rule); +} -- cgit v1.2.3 From e8ef19c4ad161768e1d8309d5ae18481c098eb81 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:24 +0200 Subject: crush: eliminate CRUSH_MAX_SET result size limitation This is only present to size the temporary scratch arrays that we put on the stack. Let the caller allocate them as they wish and remove the limitation. Reflects ceph.git commit 1cfe140bf2dab99517589a82a916f4c75b9492d1. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 1 - include/linux/crush/mapper.h | 3 ++- net/ceph/crush/mapper.c | 10 ++++++---- net/ceph/osdmap.c | 16 +++++++++++++--- 4 files changed, 21 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 09561a04c127..83543c504b5a 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -21,7 +21,6 @@ #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ -#define CRUSH_MAX_SET 10 /* max size of a mapping result */ /* diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index 69310b031875..eab367446eea 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h @@ -14,6 +14,7 @@ extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, i extern int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - const __u32 *weights, int weight_max); + const __u32 *weights, int weight_max, + int *scratch); #endif diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 82cab7d3e89d..dcf48bc504ea 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -478,15 +478,17 @@ reject: * @result_max: maximum result size * @weight: weight vector (for map leaves) * @weight_max: size of weight vector + * @scratch: scratch vector for private use; must be >= 3 * result_max */ int crush_do_rule(const struct crush_map *map, int ruleno, int x, int *result, int result_max, - const __u32 *weight, int weight_max) + const __u32 *weight, int weight_max, + int *scratch) { int result_len; - int a[CRUSH_MAX_SET]; - int b[CRUSH_MAX_SET]; - int c[CRUSH_MAX_SET]; + int *a = scratch; + int *b = scratch + result_max; + int *c = scratch + result_max*2; int 
recurse_to_leaf; int *w; int wsize = 0; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 6477a68ddecb..8b1a6b48bb5d 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1110,6 +1110,16 @@ int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, } EXPORT_SYMBOL(ceph_calc_ceph_pg); +static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, + int *result, int result_max, + const __u32 *weight, int weight_max) +{ + int scratch[result_max * 3]; + + return crush_do_rule(map, ruleno, x, result, result_max, + weight, weight_max, scratch); +} + /* * Calculate raw osd vector for the given pgid. Return pointer to osd * array, or NULL on failure. @@ -1163,9 +1173,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, pool->pgp_num_mask) + (unsigned)pgid.pool; } - r = crush_do_rule(osdmap->crush, ruleno, pps, osds, - min_t(int, pool->size, *num), - osdmap->osd_weight, osdmap->max_osd); + r = crush_do_rule_ary(osdmap->crush, ruleno, pps, + osds, min_t(int, pool->size, *num), + osdmap->osd_weight, osdmap->max_osd); if (r < 0) { pr_err("error %d from crush rule: pool %lld ruleset %d type %d" " size %d\n", r, pgid.pool, pool->crush_ruleset, -- cgit v1.2.3 From c6d98a603a02594f6ecf16d0a0af989ae9fa7abd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:25 +0200 Subject: crush: return CRUSH_ITEM_UNDEF for failed placements with indep For firstn mode, if we fail to make a valid placement choice, we just continue and return a short result to the caller. For indep mode, however, we need to make the position stable, and return an undefined value on failed placements to avoid shifting later results to the left. Reflects ceph.git commit b1d4dd4eb044875874a1d01c01c7d766db5d0a80. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 3 ++- net/ceph/crush/mapper.c | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 83543c504b5a..3d6a12928560 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -19,10 +19,11 @@ #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ - #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ +#define CRUSH_ITEM_UNDEF 0x7fffffff /* undefined result */ + /* * CRUSH uses user-defined "rules" to describe how inputs should be * mapped to devices. A rule consists of sequence of steps to perform diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index dcf48bc504ea..a8605245d190 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -455,8 +455,12 @@ reject: } while (retry_descent); if (skip_rep) { - dprintk("skip rep\n"); - continue; + if (firstn) { + dprintk("skip rep\n"); + continue; + } + dprintk("undef rep, continuing\n"); + item = CRUSH_ITEM_UNDEF; } dprintk("CHOOSE got %d\n", item); -- cgit v1.2.3 From 9a3b490a20e06368c81d7a81506e99388e733379 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:25 +0200 Subject: crush: use breadth-first search for indep mode Reflects ceph.git commit 86e978036a4ecbac4c875e7c00f6c5bbe37282d3. 
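The user-visible difference is positional stability of the result vector. A toy illustration (not CRUSH itself) of what the two modes return when the choice for slot 1 fails:

	#include <stdio.h>

	#define ITEM_NONE 0x7fffffff	/* mirrors CRUSH_ITEM_NONE below */

	int main(void)
	{
		/*
		 * firstn compacts the result, shifting later choices
		 * left; indep keeps the failed slot and marks it NONE
		 * so surviving shards stay at their original positions.
		 */
		int firstn_res[] = { 4, 7 };		/* short result */
		int indep_res[]  = { 4, ITEM_NONE, 7 };	/* stable slots */

		printf("firstn: [%d, %d]\n", firstn_res[0], firstn_res[1]);
		printf("indep:  [%d, NONE, %d]\n", indep_res[0],
		       indep_res[2]);
		return 0;
	}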
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 3 +- net/ceph/crush/mapper.c | 172 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 165 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 3d6a12928560..4023b1b52296 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -22,7 +22,8 @@ #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ -#define CRUSH_ITEM_UNDEF 0x7fffffff /* undefined result */ +#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */ +#define CRUSH_ITEM_NONE 0x7fffffff /* no result */ /* * CRUSH uses user-defined "rules" to describe how inputs should be diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index a8605245d190..caeb1066bea3 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -473,6 +473,148 @@ reject: } +/** + * choose indep: alternative breadth-first positionally stable mapping + * + */ +static void crush_choose_indep(const struct crush_map *map, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int numrep, int type, + int *out, int outpos, + int recurse_to_leaf, + int *out2) +{ + struct crush_bucket *in = bucket; + int left = numrep - outpos; + int rep; + unsigned int ftotal; + int r; + int i; + int item = 0; + int itemtype; + int collide; + + dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", + bucket->id, x, outpos, numrep); + + /* initially my result is undefined */ + for (rep = outpos; rep < numrep; rep++) { + out[rep] = CRUSH_ITEM_UNDEF; + if (out2) + out2[rep] = CRUSH_ITEM_UNDEF; + } + + for (ftotal = 0; left > 0 && ftotal < map->choose_total_tries; ftotal++) { + for (rep = outpos; rep < numrep; rep++) { + if (out[rep] != CRUSH_ITEM_UNDEF) + continue; + + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + for (;;) { + r = rep; + + /* be careful */ + if (in->alg == CRUSH_BUCKET_UNIFORM && + in->size % numrep == 0) + /* r'=r+(n+1)*f_total */ + r += (numrep+1) * ftotal; + else + /* r' = r + n*f_total */ + r += numrep * ftotal; + + /* bucket choose */ + if (in->size == 0) { + dprintk(" empty bucket\n"); + break; + } + + item = crush_bucket_choose(in, x, r); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = CRUSH_ITEM_NONE; + left--; + break; + } + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? */ + if (itemtype != type) { + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = + CRUSH_ITEM_NONE; + left--; + break; + } + in = map->buckets[-1-item]; + continue; + } + + /* collision? */ + collide = 0; + for (i = outpos; i < numrep; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + if (collide) + break; + + if (recurse_to_leaf) { + if (item < 0) { + crush_choose_indep(map, + map->buckets[-1-item], + weight, weight_max, + x, rep+1, 0, + out2, rep, + 0, NULL); + if (out2[rep] == CRUSH_ITEM_NONE) { + /* placed nothing; no leaf */ + break; + } + } else { + /* we already have a leaf! */ + out2[rep] = item; + } + } + + /* out? */ + if (itemtype == 0 && + is_out(map, weight, weight_max, item, x)) + break; + + /* yay! 
*/ + out[rep] = item; + left--; + break; + } + } + } + for (rep = outpos; rep < numrep; rep++) { + if (out[rep] == CRUSH_ITEM_UNDEF) { + out[rep] = CRUSH_ITEM_NONE; + } + if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { + out2[rep] = CRUSH_ITEM_NONE; + } + } +} + /** * crush_do_rule - calculate a mapping with the given input and rule * @map: the crush_map @@ -556,15 +698,27 @@ int crush_do_rule(const struct crush_map *map, continue; } j = 0; - osize += crush_choose(map, - map->buckets[-1-w[i]], - weight, weight_max, - x, numrep, - curstep->arg2, - o+osize, j, - firstn, - recurse_to_leaf, - descend_once, c+osize); + if (firstn) { + osize += crush_choose(map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, + curstep->arg2, + o+osize, j, + firstn, + recurse_to_leaf, + descend_once, c+osize); + } else { + crush_choose_indep(map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, + curstep->arg2, + o+osize, j, + recurse_to_leaf, + c+osize); + osize += numrep; + } } if (recurse_to_leaf) -- cgit v1.2.3 From be3226acc5544bcc91e756eb3ee6ca7b74f6f0a8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:26 +0200 Subject: crush: new SET_CHOOSE_LEAF_TRIES command Explicitly control the number of sample attempts, and allow the number of tries in the recursive call to be explicitly controlled via the rule. This is important because the amount of time we want to spend looking for a solution may be rule dependent (e.g., higher for the wide indep pool than the rep pools). (We should do the same for the other tunables, by the way!) Reflects ceph.git commit c43c893be872f709c787bc57f46c0e97876ff681. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 2 ++ net/ceph/crush/mapper.c | 31 +++++++++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 4023b1b52296..2e50bab91655 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -46,6 +46,8 @@ enum { CRUSH_RULE_EMIT = 4, /* no args */ CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, + + CRUSH_RULE_SET_CHOOSE_LEAF_TRIES = 9, }; /* diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index c727836b5860..e3ade074541c 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -455,11 +455,13 @@ reject: * */ static void crush_choose_indep(const struct crush_map *map, - struct crush_bucket *bucket, - const __u32 *weight, int weight_max, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, int x, int left, int numrep, int type, - int *out, int outpos, - int recurse_to_leaf, + int *out, int outpos, + unsigned int attempts, + unsigned int recurse_attempts, + int recurse_to_leaf, int *out2, int parent_r) { @@ -483,7 +485,7 @@ static void crush_choose_indep(const struct crush_map *map, out2[rep] = CRUSH_ITEM_UNDEF; } - for (ftotal = 0; left > 0 && ftotal < map->choose_total_tries; ftotal++) { + for (ftotal = 0; left > 0 && ftotal < attempts; ftotal++) { for (rep = outpos; rep < endpos; rep++) { if (out[rep] != CRUSH_ITEM_UNDEF) continue; @@ -564,11 +566,12 @@ static void crush_choose_indep(const struct crush_map *map, if (recurse_to_leaf) { if (item < 0) { crush_choose_indep(map, - map->buckets[-1-item], - weight, weight_max, - x, 1, numrep, 0, - out2, rep, - 0, NULL, r); + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_attempts, 0, + 0, NULL, r); if (out2[rep] 
== CRUSH_ITEM_NONE) { /* placed nothing; no leaf */ break; @@ -631,6 +634,7 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; + int choose_leaf_tries = 1; const int descend_once = 0; if ((__u32)ruleno >= map->max_rules) { @@ -653,6 +657,11 @@ int crush_do_rule(const struct crush_map *map, wsize = 1; break; + case CRUSH_RULE_SET_CHOOSE_LEAF_TRIES: + if (curstep->arg1 > 0) + choose_leaf_tries = curstep->arg1; + break; + case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; @@ -702,6 +711,8 @@ int crush_do_rule(const struct crush_map *map, x, numrep, numrep, curstep->arg2, o+osize, j, + map->choose_total_tries, + choose_leaf_tries, recurse_to_leaf, c+osize, 0); -- cgit v1.2.3 From f18650ace38ef200dd1578257c75e9407297953c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:26 +0200 Subject: crush: apply chooseleaf_tries to firstn mode too Parameterize the attempts for the _firstn choose method, and apply the rule-specified tries count to firstn mode as well. Note that we have slightly different behavior here than with indep: If the firstn value is not specified for firstn, we pass through the normal attempt count. This maintains compatibility with legacy behavior. Note that this is usually *not* actually N^2 work, though, because of the descend_once tunable. However, descend_once is unfortunately *not* the same thing as 1 chooseleaf try because it is only checked on a reject but not on a collision. Sigh. In contrast, for indep, if tries is not specified we default to 1 recursive attempt, because that is simply more sane, and we have the option to do so. The descend_once tunable has no effect for indep. Reflects ceph.git commit 64aeded50d80942d66a5ec7b604ff2fcbf5d7b63. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 5 ++++- net/ceph/crush/mapper.c | 14 ++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 2e50bab91655..07b8fd4f81fc 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -165,7 +165,10 @@ struct crush_map { __u32 choose_local_fallback_tries; /* choose attempts before giving up */ __u32 choose_total_tries; - /* attempt chooseleaf inner descent once; on failure retry outer descent */ + /* attempt chooseleaf inner descent once for firstn mode; on + * reject retry outer descent. Note that this does *not* + * apply to a collision: in that case we will retry as we used + * to. 
*/ __u32 chooseleaf_descend_once; }; diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index e3ade074541c..c34320518c8b 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -299,6 +299,8 @@ static int crush_choose_firstn(const struct crush_map *map, const __u32 *weight, int weight_max, int x, int numrep, int type, int *out, int outpos, + unsigned int attempts, + unsigned int recurse_attempts, int recurse_to_leaf, int descend_once, int *out2) { @@ -385,6 +387,7 @@ static int crush_choose_firstn(const struct crush_map *map, weight, weight_max, x, outpos+1, 0, out2, outpos, + recurse_attempts, 0, 0, map->chooseleaf_descend_once, NULL) <= outpos) @@ -421,7 +424,7 @@ reject: flocal <= in->size + map->choose_local_fallback_tries) /* exhaustive bucket search */ retry_bucket = 1; - else if (ftotal <= map->choose_total_tries) + else if (ftotal <= attempts) /* then retry descent */ retry_descent = 1; else @@ -634,7 +637,8 @@ int crush_do_rule(const struct crush_map *map, __u32 step; int i, j; int numrep; - int choose_leaf_tries = 1; + int choose_tries = map->choose_total_tries; + int choose_leaf_tries = 0; const int descend_once = 0; if ((__u32)ruleno >= map->max_rules) { @@ -701,6 +705,8 @@ int crush_do_rule(const struct crush_map *map, x, numrep, curstep->arg2, o+osize, j, + choose_tries, + choose_leaf_tries ? choose_leaf_tries : choose_tries, recurse_to_leaf, descend_once, c+osize); } else { @@ -711,8 +717,8 @@ int crush_do_rule(const struct crush_map *map, x, numrep, numrep, curstep->arg2, o+osize, j, - map->choose_total_tries, - choose_leaf_tries, + choose_tries, + choose_leaf_tries ? choose_leaf_tries : 1, recurse_to_leaf, c+osize, 0); -- cgit v1.2.3 From cc10df4a3a5c34cb1d5b635ac70dd1fc406153ce Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:26 +0200 Subject: crush: add SET_CHOOSE_TRIES rule step Since we can specify the recursive retries in a rule, we may as well also specify the non-recursive tries too for completeness. Reflects ceph.git commit d1b97462cffccc871914859eaee562f2786abfd1. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 3 ++- net/ceph/crush/mapper.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 07b8fd4f81fc..5f95969347ec 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -47,7 +47,8 @@ enum { CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, - CRUSH_RULE_SET_CHOOSE_LEAF_TRIES = 9, + CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */ + CRUSH_RULE_SET_CHOOSE_LEAF_TRIES = 9, /* override chooseleaf_descend_once */ }; /* diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index c34320518c8b..a1acdea935bf 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -661,6 +661,11 @@ int crush_do_rule(const struct crush_map *map, wsize = 1; break; + case CRUSH_RULE_SET_CHOOSE_TRIES: + if (curstep->arg1 > 0) + choose_tries = curstep->arg1; + break; + case CRUSH_RULE_SET_CHOOSE_LEAF_TRIES: if (curstep->arg1 > 0) choose_leaf_tries = curstep->arg1; -- cgit v1.2.3 From 917edad5d1d62070436b74ecbf5ea019b651ff69 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:26 +0200 Subject: crush: CHOOSE_LEAF -> CHOOSELEAF throughout This aligns the internal identifier names with the user-visible names in the decompiled crush map language. 
Reflects ceph.git commit caa0e22e15e4226c3671318ba1f61314bf6da2a6. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 6 +++--- net/ceph/crush/mapper.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 5f95969347ec..7b0fc4aba75b 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -44,11 +44,11 @@ enum { /* arg2 = type */ CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ CRUSH_RULE_EMIT = 4, /* no args */ - CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6, - CRUSH_RULE_CHOOSE_LEAF_INDEP = 7, + CRUSH_RULE_CHOOSELEAF_FIRSTN = 6, + CRUSH_RULE_CHOOSELEAF_INDEP = 7, CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */ - CRUSH_RULE_SET_CHOOSE_LEAF_TRIES = 9, /* override chooseleaf_descend_once */ + CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ }; /* diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index a1acdea935bf..e9256a30e60d 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -666,25 +666,25 @@ int crush_do_rule(const struct crush_map *map, choose_tries = curstep->arg1; break; - case CRUSH_RULE_SET_CHOOSE_LEAF_TRIES: + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: if (curstep->arg1 > 0) choose_leaf_tries = curstep->arg1; break; - case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; /* fall through */ - case CRUSH_RULE_CHOOSE_LEAF_INDEP: + case CRUSH_RULE_CHOOSELEAF_INDEP: case CRUSH_RULE_CHOOSE_INDEP: if (wsize == 0) break; recurse_to_leaf = curstep->op == - CRUSH_RULE_CHOOSE_LEAF_FIRSTN || + CRUSH_RULE_CHOOSELEAF_FIRSTN || curstep->op == - CRUSH_RULE_CHOOSE_LEAF_INDEP; + CRUSH_RULE_CHOOSELEAF_INDEP; /* reset output */ osize = 0; -- cgit v1.2.3 From f046bf92080cbdc4a94c6e86698c5a3f10716445 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:27 +0200 Subject: crush: add set_choose_local_[fallback_]tries steps This allows all of the tunables to be overridden by a specific rule. Reflects ceph.git commits d129e09e57fbc61cfd4f492e3ee77d0750c9d292, 0497db49e5973b50df26251ed0e3f4ac7578e66e. 
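For reference, a rule overriding these tunables might look like this in the decompiled crush map language (a hypothetical rule; only the step names correspond to the CRUSH_RULE_SET_* identifiers added here):

	rule example_indep {
		ruleset 1
		type erasure
		min_size 3
		max_size 6
		step set_choose_local_tries 0
		step set_choose_local_fallback_tries 0
		step set_chooseleaf_tries 5
		step take default
		step chooseleaf indep 0 type host
		step emit
	}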
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/crush/crush.h | 2 ++ net/ceph/crush/mapper.c | 28 +++++++++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 7b0fc4aba75b..acaa5615d634 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -49,6 +49,8 @@ enum { CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */ CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ + CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, + CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, }; /* diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 0613dd2d5fa3..8cde4818e18b 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -300,6 +300,8 @@ static int crush_choose_firstn(const struct crush_map *map, int *out, int outpos, unsigned int attempts, unsigned int recurse_attempts, + unsigned int local_tries, + unsigned int local_fallback_tries, int recurse_to_leaf, int *out2) { @@ -338,9 +340,9 @@ static int crush_choose_firstn(const struct crush_map *map, reject = 1; goto reject; } - if (map->choose_local_fallback_tries > 0 && + if (local_fallback_tries > 0 && flocal >= (in->size>>1) && - flocal > map->choose_local_fallback_tries) + flocal > local_fallback_tries) item = bucket_perm_choose(in, x, r); else item = crush_bucket_choose(in, x, r); @@ -387,6 +389,8 @@ static int crush_choose_firstn(const struct crush_map *map, x, outpos+1, 0, out2, outpos, recurse_attempts, 0, + local_tries, + local_fallback_tries, 0, NULL) <= outpos) /* didn't get leaf */ @@ -412,11 +416,11 @@ reject: ftotal++; flocal++; - if (collide && flocal <= map->choose_local_tries) + if (collide && flocal <= local_tries) /* retry locally a few times */ retry_bucket = 1; - else if (map->choose_local_fallback_tries > 0 && - flocal <= in->size + map->choose_local_fallback_tries) + else if (local_fallback_tries > 0 && + flocal <= in->size + local_fallback_tries) /* exhaustive bucket search */ retry_bucket = 1; else if (ftotal <= attempts) @@ -633,6 +637,8 @@ int crush_do_rule(const struct crush_map *map, int i, j; int numrep; int choose_tries = map->choose_total_tries; + int choose_local_tries = map->choose_local_tries; + int choose_local_fallback_tries = map->choose_local_fallback_tries; int choose_leaf_tries = 0; if ((__u32)ruleno >= map->max_rules) { @@ -665,6 +671,16 @@ int crush_do_rule(const struct crush_map *map, choose_leaf_tries = curstep->arg1; break; + case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: + if (curstep->arg1 > 0) + choose_local_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: + if (curstep->arg1 > 0) + choose_local_fallback_tries = curstep->arg1; + break; + case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN: firstn = 1; @@ -714,6 +730,8 @@ int crush_do_rule(const struct crush_map *map, o+osize, j, choose_tries, recurse_tries, + choose_local_tries, + choose_local_fallback_tries, recurse_to_leaf, c+osize); } else { -- cgit v1.2.3 From cdff49918c8286ac18593e742ead25242c76c81d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 24 Dec 2013 21:19:27 +0200 Subject: crush: support new indep mode and SET_* steps (crush v2) by default Add CRUSH_V2 feature (new indep mode and SET_* steps) to a set of features supported by default. 
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 003ea719be65..5f42e4412909 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -79,7 +79,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_CRUSH_TUNABLES | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ - CEPH_FEATURE_OSDHASHPSPOOL) + CEPH_FEATURE_OSDHASHPSPOOL | \ + CEPH_FEATURE_CRUSH_V2) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit v1.2.3 From 3cea4c3071d4e55e9d7356efe9d0ebf92f0c2204 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 9 Jan 2014 20:08:21 +0200 Subject: libceph: rename ceph_msg::front_max to front_alloc_len Rename front_max field of struct ceph_msg to front_alloc_len to make its purpose more clear. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 6 +++--- net/ceph/mon_client.c | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c1d3f5a65273..861138f7c161 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -157,7 +157,7 @@ struct ceph_msg { bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; - int front_max; + int front_alloc_len; unsigned long ack_stamp; /* tx: when we were acked */ struct ceph_msgpool *pool; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d2cadb5b2b63..f4d411c017e7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3130,7 +3130,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, INIT_LIST_HEAD(&m->data); /* front */ - m->front_max = front_len; + m->front_alloc_len = front_len; if (front_len) { if (front_len > PAGE_CACHE_SIZE) { m->front.iov_base = __vmalloc(front_len, flags, @@ -3305,8 +3305,8 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { - pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, - msg->front_max, msg->data_length); + pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, + msg->front_alloc_len, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 1fe25cd29d0e..2ac9ef35110b 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -152,7 +152,7 @@ static int __open_session(struct ceph_mon_client *monc) /* initiatiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); __send_prepared_auth_request(monc, ret); } else { dout("open_session mon%d already open\n", monc->cur_mon); @@ -196,7 +196,7 @@ static void __send_subscribe(struct ceph_mon_client *monc) int num; p = msg->front.iov_base; - end = p + msg->front_max; + end = p + msg->front_alloc_len; num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; ceph_encode_32(&p, num); @@ -897,7 +897,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); if (ret < 0) { 
monc->client->auth_err = ret; wake_up_all(&monc->client->auth_wq); @@ -939,7 +939,7 @@ static int __validate_auth(struct ceph_mon_client *monc) return 0; ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, - monc->m_auth->front_max); + monc->m_auth->front_alloc_len); if (ret <= 0) return ret; /* either an error, or no need to authenticate */ __send_prepared_auth_request(monc, ret); -- cgit v1.2.3 From 186e4f7a4b1883f3f46aa15366c0bcebc28fdda7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 22 Nov 2013 14:48:37 +0800 Subject: ceph: handle session flush message Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 19 +++++++++++++++++++ fs/ceph/strings.c | 2 ++ include/linux/ceph/ceph_fs.h | 2 ++ 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1fd655ac806a..7c00dd530bd1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1137,6 +1137,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, return 0; } +static int send_flushmsg_ack(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, u64 seq) +{ + struct ceph_msg *msg; + + dout("send_flushmsg_ack to mds%d (%s) seq %lld\n", + session->s_mds, session_state_name(session->s_state), seq); + msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + if (!msg) + return -ENOMEM; + ceph_con_send(&session->s_con, msg); + return 0; +} + + /* * Note new cap ttl, and any transition from stale -> not stale (fresh?). * @@ -2396,6 +2411,10 @@ static void handle_session(struct ceph_mds_session *session, trim_caps(mdsc, session, le32_to_cpu(h->max_caps)); break; + case CEPH_SESSION_FLUSHMSG: + send_flushmsg_ack(mdsc, session, seq); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 89fa4a940a0f..4440f447fd3f 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_RENEWCAPS: return "renewcaps"; case CEPH_SESSION_STALE: return "stale"; case CEPH_SESSION_RECALL_STATE: return "recall_state"; + case CEPH_SESSION_FLUSHMSG: return "flushmsg"; + case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; } return "???"; } diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 2ad7b860f062..26bb587deb78 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -282,6 +282,8 @@ enum { CEPH_SESSION_RENEWCAPS, CEPH_SESSION_STALE, CEPH_SESSION_RECALL_STATE, + CEPH_SESSION_FLUSHMSG, + CEPH_SESSION_FLUSHMSG_ACK, }; extern const char *ceph_session_op_name(int op); -- cgit v1.2.3 From 4ee6a914edbbd2543884f0ad7d58ea471136be32 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 24 Nov 2013 14:43:46 +0800 Subject: ceph: remove exported caps when handling cap import message A version 3 cap import message includes the ID of the exported caps. This allows us to remove the exported caps if we still haven't received the corresponding cap export message. We remove the exported caps because they are stale; keeping them could compromise consistency.
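The decode side follows the usual pattern for versioned MDS messages: the optional trailing block is read only when the header version is new enough, and only after an explicit bounds check. A standalone sketch of that pattern (simplified types; not the kernel decoder):

	#include <stddef.h>
	#include <stdint.h>

	struct cap_peer {		/* mirrors ceph_mds_cap_peer below */
		uint64_t cap_id;
		uint32_t seq, mseq, mds;
		uint8_t flags;
	} __attribute__((packed));

	/*
	 * Return the peer block if the message is v3+ and long enough,
	 * otherwise NULL - an older MDS simply does not send it.
	 */
	static const struct cap_peer *decode_peer(const void *p,
						  const void *end,
						  int hdr_version)
	{
		if (hdr_version < 3)
			return NULL;
		if ((const char *)p + sizeof(struct cap_peer) >
		    (const char *)end)
			return NULL;	/* truncated message */
		return (const struct cap_peer *)p;
	}

	int main(void)
	{
		unsigned char buf[sizeof(struct cap_peer)] = { 0 };

		/* a v2 message carries no peer block */
		return decode_peer(buf, buf + sizeof(buf), 2) == NULL ? 0 : 1;
	}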
Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 79 +++++++++++++++++++++++++++++--------------- include/linux/ceph/ceph_fs.h | 11 +++++- 2 files changed, 62 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index d65ff334901c..98f3ca4a5ddf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -611,6 +611,7 @@ retry: if (ci->i_auth_cap == NULL || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) ci->i_auth_cap = cap; + ci->i_cap_exporting_issued = 0; } else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; spin_lock(&mdsc->cap_dirty_lock); @@ -2823,10 +2824,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, + struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, void *snaptrace, int snaptrace_len) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap *cap; int mds = session->s_mds; unsigned issued = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); @@ -2834,28 +2837,44 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); + u64 p_cap_id; + int peer; - if (ci->i_cap_exporting_mds >= 0 && - ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) { - dout("handle_cap_import inode %p ci %p mds%d mseq %d" - " - cleared exporting from mds%d\n", - inode, ci, mds, mseq, - ci->i_cap_exporting_mds); - ci->i_cap_exporting_issued = 0; - ci->i_cap_exporting_mseq = 0; - ci->i_cap_exporting_mds = -1; + if (ph) { + p_cap_id = le64_to_cpu(ph->cap_id); + peer = le32_to_cpu(ph->mds); + } else { + p_cap_id = 0; + peer = -1; + } - spin_lock(&mdsc->cap_dirty_lock); - if (!list_empty(&ci->i_dirty_item)) { - dout(" moving %p back to cap_dirty\n", inode); - list_move(&ci->i_dirty_item, &mdsc->cap_dirty); + dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", + inode, ci, mds, mseq, peer); + + spin_lock(&ci->i_ceph_lock); + cap = peer >= 0 ? 
__get_cap_for_mds(ci, peer) : NULL; + if (cap && cap->cap_id == p_cap_id) { + dout(" remove export cap %p mds%d flags %d\n", + cap, peer, ph->flags); + if ((ph->flags & CEPH_CAP_FLAG_AUTH) && + (cap->seq != le32_to_cpu(ph->seq) || + cap->mseq != le32_to_cpu(ph->mseq))) { + pr_err("handle_cap_import: mismatched seq/mseq: " + "ino (%llx.%llx) mds%d seq %d mseq %d " + "importer mds%d has peer seq %d mseq %d\n", + ceph_vinop(inode), peer, cap->seq, + cap->mseq, mds, le32_to_cpu(ph->seq), + le32_to_cpu(ph->mseq)); } - spin_unlock(&mdsc->cap_dirty_lock); - } else { - dout("handle_cap_import inode %p ci %p mds%d mseq %d\n", - inode, ci, mds, mseq); + ci->i_cap_exporting_issued = cap->issued; + __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } + /* make sure we re-request max_size, if necessary */ + ci->i_wanted_max_size = 0; + ci->i_requested_max_size = 0; + spin_unlock(&ci->i_ceph_lock); + down_write(&mdsc->snap_rwsem); ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, false); @@ -2866,11 +2885,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, kick_flushing_inode_caps(mdsc, session, inode); up_read(&mdsc->snap_rwsem); - /* make sure we re-request max_size, if necessary */ - spin_lock(&ci->i_ceph_lock); - ci->i_wanted_max_size = 0; /* reset */ - ci->i_requested_max_size = 0; - spin_unlock(&ci->i_ceph_lock); } /* @@ -2888,6 +2902,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; + struct ceph_mds_cap_peer *peer = NULL; int mds = session->s_mds; int op; u32 seq, mseq; @@ -2898,12 +2913,14 @@ void ceph_handle_caps(struct ceph_mds_session *session, void *snaptrace; size_t snaptrace_len; void *flock; + void *end; u32 flock_len; int open_target_sessions = 0; dout("handle_caps from mds%d\n", mds); /* decode */ + end = msg->front.iov_base + msg->front.iov_len; tid = le64_to_cpu(msg->hdr.tid); if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -2921,17 +2938,25 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace_len = le32_to_cpu(h->snap_trace_len); if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p, *end; - - p = snaptrace + snaptrace_len; - end = msg->front.iov_base + msg->front.iov_len; + void *p = snaptrace + snaptrace_len; ceph_decode_32_safe(&p, end, flock_len, bad); + if (p + flock_len > end) + goto bad; flock = p; } else { flock = NULL; flock_len = 0; } + if (le16_to_cpu(msg->hdr.version) >= 3) { + if (op == CEPH_CAP_OP_IMPORT) { + void *p = flock + flock_len; + if (p + sizeof(*peer) > end) + goto bad; + peer = p; + } + } + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, @@ -2968,7 +2993,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done; case CEPH_CAP_OP_IMPORT: - handle_cap_import(mdsc, inode, h, session, + handle_cap_import(mdsc, inode, h, peer, session, snaptrace, snaptrace_len); } diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 26bb587deb78..0a37b989b52f 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -459,7 +459,8 @@ struct ceph_mds_reply_cap { __u8 flags; /* CEPH_CAP_FLAG_* */ } __attribute__ ((packed)); -#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */ +#define CEPH_CAP_FLAG_AUTH (1 << 0) /* cap is issued by auth mds */ +#define CEPH_CAP_FLAG_RELEASE (1 << 1) /* release the cap */ /* inode record, for bundling with mds reply */ struct ceph_mds_reply_inode { @@ -660,6 +661,14 @@ struct 
ceph_mds_caps { __le32 time_warp_seq; } __attribute__ ((packed)); +struct ceph_mds_cap_peer { + __le64 cap_id; + __le32 seq; + __le32 mseq; + __le32 mds; + __u8 flags; +} __attribute__ ((packed)); + /* cap release msg head */ struct ceph_mds_cap_release { __le32 num; /* number of cap_items that follow */ -- cgit v1.2.3 From 80213a84a96c3040f5824bce646a184d5dd3dd2b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 21 Jan 2014 11:07:16 +0800 Subject: libceph: support CEPH_FEATURE_EXPORT_PEER Signed-off-by: Yan, Zheng --- include/linux/ceph/ceph_features.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 5f42e4412909..d21b34d94436 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -80,7 +80,8 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ CEPH_FEATURE_OSDHASHPSPOOL | \ - CEPH_FEATURE_CRUSH_V2) + CEPH_FEATURE_CRUSH_V2 | \ + CEPH_FEATURE_EXPORT_PEER) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ -- cgit v1.2.3 From eeb0bed5572b1282009dfc2635604df5a35d1a02 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 9 Jan 2014 20:08:21 +0200 Subject: libceph: add ceph_kv{malloc,free}() and switch to them Encapsulate kmalloc vs vmalloc memory allocation and freeing logic into two helpers, ceph_kvmalloc() and ceph_kvfree(), and switch to them. ceph_kvmalloc() kmalloc()'s a maximum of 8 pages, anything bigger is vmalloc()'ed with __GFP_HIGHMEM set. This changes the existing behaviour: - for buffers (ceph_buffer_new()), from trying to kmalloc() everything and using vmalloc() just as a fallback - for messages (ceph_msg_new()), from going to vmalloc() for anything bigger than a page - for messages (ceph_msg_new()), from disallowing vmalloc() to use high memory Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/buffer.h | 1 - include/linux/ceph/libceph.h | 11 +++++++---- include/linux/ceph/messenger.h | 1 - net/ceph/buffer.c | 22 ++++++---------------- net/ceph/ceph_common.c | 20 ++++++++++++++++++++ net/ceph/messenger.c | 13 ++----------- 6 files changed, 35 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/buffer.h b/include/linux/ceph/buffer.h index 58d19014068f..07ad423cc37f 100644 --- a/include/linux/ceph/buffer.h +++ b/include/linux/ceph/buffer.h @@ -17,7 +17,6 @@ struct ceph_buffer { struct kref kref; struct kvec vec; size_t alloc_len; - bool is_vmalloc; }; extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp); diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 7d704db60cbb..2f49aa4c4f7f 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -173,15 +173,18 @@ static inline int calc_pages_for(u64 off, u64 len) (off >> PAGE_CACHE_SHIFT); } +extern struct kmem_cache *ceph_inode_cachep; +extern struct kmem_cache *ceph_cap_cachep; +extern struct kmem_cache *ceph_dentry_cachep; +extern struct kmem_cache *ceph_file_cachep; + /* ceph_common.c */ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -extern struct kmem_cache *ceph_inode_cachep; -extern struct kmem_cache *ceph_cap_cachep; -extern struct kmem_cache *ceph_dentry_cachep; -extern struct kmem_cache *ceph_file_cachep; +extern void 
*ceph_kvmalloc(size_t size, gfp_t flags); +extern void ceph_kvfree(const void *ptr); extern struct ceph_options *ceph_parse_options(char *options, const char *dev_name, const char *dev_name_end, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 861138f7c161..20ee8b63a968 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -154,7 +154,6 @@ struct ceph_msg { struct list_head list_head; /* links for connection lists */ struct kref kref; - bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; int front_alloc_len; diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index bf3e6a13c215..621b5f65407f 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -6,6 +6,7 @@ #include #include +#include /* for ceph_kv{malloc,free} */ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -15,16 +16,10 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (!b) return NULL; - b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN); - if (b->vec.iov_base) { - b->is_vmalloc = false; - } else { - b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL); - if (!b->vec.iov_base) { - kfree(b); - return NULL; - } - b->is_vmalloc = true; + b->vec.iov_base = ceph_kvmalloc(len, gfp); + if (!b->vec.iov_base) { + kfree(b); + return NULL; } kref_init(&b->kref); @@ -40,12 +35,7 @@ void ceph_buffer_release(struct kref *kref) struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); dout("buffer_release %p\n", b); - if (b->vec.iov_base) { - if (b->is_vmalloc) - vfree(b->vec.iov_base); - else - kfree(b->vec.iov_base); - } + ceph_kvfree(b->vec.iov_base); kfree(b); } EXPORT_SYMBOL(ceph_buffer_release); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 43d8177a52e1..67d7721d237e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -170,6 +171,25 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); +void *ceph_kvmalloc(size_t size, gfp_t flags) +{ + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + void *ptr = kmalloc(size, flags | __GFP_NOWARN); + if (ptr) + return ptr; + } + + return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); +} + +void ceph_kvfree(const void *ptr) +{ + if (is_vmalloc_addr(ptr)) + vfree(ptr); + else + kfree(ptr); +} + static int parse_fsid(const char *str, struct ceph_fsid *fsid) { diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 252ad4e01cf8..2ed1304d22a7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -3131,13 +3131,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, /* front */ if (front_len) { - if (front_len > PAGE_CACHE_SIZE) { - m->front.iov_base = __vmalloc(front_len, flags, - PAGE_KERNEL); - m->front_is_vmalloc = true; - } else { - m->front.iov_base = kmalloc(front_len, flags); - } + m->front.iov_base = ceph_kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); @@ -3259,10 +3253,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) void ceph_msg_kfree(struct ceph_msg *m) { dout("msg_kfree %p\n", m); - if (m->front_is_vmalloc) - vfree(m->front.iov_base); - else - kfree(m->front.iov_base); + ceph_kvfree(m->front.iov_base); kmem_cache_free(ceph_msg_cache, m); } -- cgit v1.2.3 From 22116525baec1d63f4878eaa92f0b57946a78819 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 
17:40:18 +0200 Subject: libceph: start using oloc abstraction Instead of relying on pool fields in ceph_file_layout (for mapping) and ceph_pg (for encoding), start using ceph_object_locator (oloc) abstraction. Note that userspace oloc currently consists of pool, key, nspace and hash fields, while this one contains only a pool. This is OK, because at this point we only send (i.e. encode) olocs and never have to receive (i.e. decode) them. This makes keeping a copy of ceph_file_layout in every osd request unnecessary, so ceph_osd_request::r_file_layout field is nuked. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- drivers/block/rbd.c | 8 ++++---- include/linux/ceph/osd_client.h | 3 ++- include/linux/ceph/osdmap.h | 3 +-- net/ceph/osd_client.c | 8 +++++--- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 72a7eec456a9..6614e8d95525 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1808,12 +1808,12 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; + osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + osd_req->r_oid_len = strlen(obj_request->object_name); rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - osd_req->r_file_layout = rbd_dev->layout; /* struct */ - return osd_req; } @@ -1849,12 +1849,12 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; + osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + osd_req->r_oid_len = strlen(obj_request->object_name); rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); - osd_req->r_file_layout = rbd_dev->layout; /* struct */ - return osd_req; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4fb6a8938957..5b8555109789 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -159,12 +159,13 @@ struct ceph_osd_request { struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ + struct ceph_object_locator r_oloc; + char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ int r_oid_len; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ - struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ }; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index d05cc4451af6..256134af4ad4 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -40,8 +40,7 @@ struct ceph_pg_pool_info { }; struct ceph_object_locator { - uint64_t pool; - char *key; + s64 pool; }; struct ceph_pg_mapping { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 959d332ed534..7130c5c83d79 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -368,6 +368,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); + req->r_oloc.pool = -1; + /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); @@ -761,7 +763,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (num_ops > 1) osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); - req->r_file_layout = *layout; /* keep a copy */ +
req->r_oloc.pool = ceph_file_layout_pg_pool(*layout); snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, objnum); @@ -1268,7 +1270,7 @@ static int __map_request(struct ceph_osd_client *osdc, dout("map_request %p tid %lld\n", req, req->r_tid); err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, - ceph_file_layout_pg_pool(req->r_file_layout)); + req->r_oloc.pool); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; @@ -1354,7 +1356,7 @@ static void __send_request(struct ceph_osd_client *osdc, /* fill in message content that changes each time we send it */ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); + put_unaligned_le64(req->r_oloc.pool, req->r_request_pool); p = req->r_request_pgid; ceph_encode_64(&p, req->r_pgid.pool); ceph_encode_32(&p, req->r_pgid.seed); -- cgit v1.2.3 From e8221464fc2bc8c9f7b0c2115abbd75ba23f210a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:18 +0200 Subject: libceph: move ceph_file_layout helpers to ceph_fs.h Move ceph_file_layout helper macros and inline functions to ceph_fs.h. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_fs.h | 23 +++++++++++++++++++++++ include/linux/ceph/osdmap.h | 27 --------------------------- 2 files changed, 23 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 0a37b989b52f..2623cffc73a1 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -53,6 +53,29 @@ struct ceph_file_layout { __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ } __attribute__ ((packed)); +#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) +#define ceph_file_layout_stripe_count(l) \ + ((__s32)le32_to_cpu((l).fl_stripe_count)) +#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) +#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) +#define ceph_file_layout_object_su(l) \ + ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) +#define ceph_file_layout_pg_pool(l) \ + ((__s32)le32_to_cpu((l).fl_pg_pool)) + +static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) +{ + return le32_to_cpu(l->fl_stripe_unit) * + le32_to_cpu(l->fl_stripe_count); +} + +/* "period" == bytes before i start on a new set of objects */ +static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) +{ + return le32_to_cpu(l->fl_object_size) * + le32_to_cpu(l->fl_stripe_count); +} + #define CEPH_MIN_STRIPE_UNIT 65536 int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 256134af4ad4..f2679c384625 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -72,33 +72,6 @@ struct ceph_osdmap { struct crush_map *crush; }; -/* - * file layout helpers - */ -#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit)) -#define ceph_file_layout_stripe_count(l) \ - ((__s32)le32_to_cpu((l).fl_stripe_count)) -#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size)) -#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash)) -#define ceph_file_layout_object_su(l) \ - ((__s32)le32_to_cpu((l).fl_object_stripe_unit)) -#define ceph_file_layout_pg_pool(l) \ - ((__s32)le32_to_cpu((l).fl_pg_pool)) 
- -static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_stripe_unit) * - le32_to_cpu(l->fl_stripe_count); -} - -/* "period" == bytes before i start on a new set of objects */ -static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l) -{ - return le32_to_cpu(l->fl_object_size) * - le32_to_cpu(l->fl_stripe_count); -} - - static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) { return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); -- cgit v1.2.3 From 2d0ebc5d591f49131bf8f93b54c5424162c3fb7f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:18 +0200 Subject: libceph: rename MAX_OBJ_NAME_SIZE to CEPH_MAX_OID_NAME_LEN In preparation for adding oid abstraction, rename MAX_OBJ_NAME_SIZE to CEPH_MAX_OID_NAME_LEN. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- drivers/block/rbd.c | 6 +++--- include/linux/ceph/osd_client.h | 4 ++-- net/ceph/osd_client.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6614e8d95525..98792b2a7f65 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1088,9 +1088,9 @@ static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) name_format = "%s.%012llx"; if (rbd_dev->image_format == 2) name_format = "%s.%016llx"; - ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, name_format, + ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, rbd_dev->header.object_prefix, segment); - if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) { + if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { pr_err("error formatting segment name for #%llu (%d)\n", segment, ret); kfree(name); @@ -5350,7 +5350,7 @@ static int rbd_slab_init(void) rbd_assert(!rbd_segment_name_cache); rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", - MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL); + CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); if (rbd_segment_name_cache) return 0; out_err: diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 5b8555109789..b42f15848cae 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -16,7 +16,7 @@ * Maximum object name size * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) */ -#define MAX_OBJ_NAME_SIZE 100 +#define CEPH_MAX_OID_NAME_LEN 100 struct ceph_msg; struct ceph_snap_context; @@ -161,7 +161,7 @@ struct ceph_osd_request { struct ceph_object_locator r_oloc; - char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ + char r_oid[CEPH_MAX_OID_NAME_LEN]; /* object name */ int r_oid_len; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7130c5c83d79..a053e7e4a780 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -338,7 +338,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, msg_size = 4 + 4 + 8 + 8 + 4+8; msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pg_t */ - msg_size += 4 + MAX_OBJ_NAME_SIZE; + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ -- cgit v1.2.3 From 4295f2217a5aa8ef2738e3a368db3c1ceab41212 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:18 +0200 Subject: libceph: introduce and start using oid abstraction In preparation for tiering support, which would require having two (base and 
target) object names for each osd request and also copying those names around, introduce struct ceph_object_id (oid) and a couple helpers to facilitate those copies and encapsulate the fact that object name is not necessarily a NUL-terminated string. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- drivers/block/rbd.c | 10 ++-------- include/linux/ceph/osd_client.h | 9 +-------- include/linux/ceph/osdmap.h | 36 ++++++++++++++++++++++++++++++++++++ net/ceph/debugfs.c | 3 ++- net/ceph/osd_client.c | 17 +++++++++-------- 5 files changed, 50 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 98792b2a7f65..6fdc5fc00447 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1809,10 +1809,7 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_priv = obj_request; osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - - osd_req->r_oid_len = strlen(obj_request->object_name); - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); + ceph_oid_set_name(&osd_req->r_oid, obj_request->object_name); return osd_req; } @@ -1850,10 +1847,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_priv = obj_request; osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - - osd_req->r_oid_len = strlen(obj_request->object_name); - rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); - memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); + ceph_oid_set_name(&osd_req->r_oid, obj_request->object_name); return osd_req; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index b42f15848cae..8d8bb53994b1 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -12,12 +12,6 @@ #include #include -/* - * Maximum object name size - * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) - */ -#define CEPH_MAX_OID_NAME_LEN 100 - struct ceph_msg; struct ceph_snap_context; struct ceph_osd_request; @@ -160,9 +154,8 @@ struct ceph_osd_request { void *r_priv; /* ditto */ struct ceph_object_locator r_oloc; + struct ceph_object_id r_oid; - char r_oid[CEPH_MAX_OID_NAME_LEN]; /* object name */ - int r_oid_len; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index f2679c384625..c85f7d43b861 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -43,6 +43,18 @@ struct ceph_object_locator { s64 pool; }; +/* + * Maximum supported by kernel client object name length + * + * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100) + */ +#define CEPH_MAX_OID_NAME_LEN 100 + +struct ceph_object_id { + char name[CEPH_MAX_OID_NAME_LEN]; + int name_len; +}; + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -72,6 +84,30 @@ struct ceph_osdmap { struct crush_map *crush; }; +static inline void ceph_oid_set_name(struct ceph_object_id *oid, + const char *name) +{ + int len; + + len = strlen(name); + if (len > sizeof(oid->name)) { + WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n", + name, len, sizeof(oid->name)); + len = sizeof(oid->name); + } + + memcpy(oid->name, name, len); + oid->name_len = len; +} + +static inline void ceph_oid_copy(struct ceph_object_id *dest, + struct ceph_object_id *src) +{ + BUG_ON(src->name_len > sizeof(dest->name)); + memcpy(dest->name, src->name, src->name_len); + 
dest->name_len = src->name_len; +} + static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) { return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 83661cdc0766..1f8562706393 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -132,7 +132,8 @@ static int osdc_show(struct seq_file *s, void *pp) req->r_osd ? req->r_osd->o_osd : -1, req->r_pgid.pool, req->r_pgid.seed); - seq_printf(s, "%.*s", req->r_oid_len, req->r_oid); + seq_printf(s, "%.*s", req->r_oid.name_len, + req->r_oid.name); if (req->r_reassert_version.epoch) seq_printf(s, "\t%u'%llu", diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a053e7e4a780..2988d68b24c6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -765,9 +765,9 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_oloc.pool = ceph_file_layout_pg_pool(*layout); - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", - vino.ino, objnum); - req->r_oid_len = strlen(req->r_oid); + snprintf(req->r_oid.name, sizeof(req->r_oid.name), + "%llx.%08llx", vino.ino, objnum); + req->r_oid.name_len = strlen(req->r_oid.name); return req; } @@ -1269,7 +1269,7 @@ static int __map_request(struct ceph_osd_client *osdc, bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, + err = ceph_calc_ceph_pg(&pgid, req->r_oid.name, osdc->osdmap, req->r_oloc.pool); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); @@ -2118,10 +2118,11 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, ceph_encode_32(&p, -1); /* preferred */ /* oid */ - ceph_encode_32(&p, req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); - p += req->r_oid_len; + ceph_encode_32(&p, req->r_oid.name_len); + memcpy(p, req->r_oid.name, req->r_oid.name_len); + dout("oid '%.*s' len %d\n", req->r_oid.name_len, + req->r_oid.name, req->r_oid.name_len); + p += req->r_oid.name_len; /* ops--can imply data */ ceph_encode_16(&p, (u16)req->r_num_ops); -- cgit v1.2.3 From 7c13cb64352230deac24d3cb058387a6c0676f83 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: replace ceph_calc_ceph_pg() with ceph_oloc_oid_to_pg() Switch ceph_calc_ceph_pg() to new oloc and oid abstractions and rename it to ceph_oloc_oid_to_pg() to make its purpose more clear. 
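For illustration, the new calling convention looks roughly like this (a sketch only, not part of the patch; "osdc", "ci" and "object_name" stand in for whatever the caller has at hand):

	struct ceph_object_locator oloc;
	struct ceph_object_id oid;
	struct ceph_pg pgid;
	int r;

	oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
	ceph_oid_set_name(&oid, object_name);

	r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
	if (r < 0)
		return r;	/* -EIO if the pool is unknown to this osdmap */

Compared to the old ceph_calc_ceph_pg(), the pool and the (possibly non-NUL-terminated) object name now travel in dedicated oloc/oid structures instead of a bare pool id and C string.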
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/ioctl.c | 8 ++++++-- include/linux/ceph/osdmap.h | 7 +++++-- net/ceph/osd_client.c | 4 ++-- net/ceph/osdmap.c | 29 +++++++++++++++++------------ 4 files changed, 30 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 669622fd1ae3..dc66c9e023e4 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -183,6 +183,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_object_locator oloc; + struct ceph_object_id oid; u64 len = 1, olen; u64 tmp; struct ceph_pg pgid; @@ -211,8 +213,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, - ceph_file_layout_pg_pool(ci->i_layout)); + oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); + ceph_oid_set_name(&oid, dl.object_name); + + r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); if (r < 0) { up_read(&osdc->map_sem); return r; diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index c85f7d43b861..ebb8ec285de6 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -163,8 +163,11 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ -extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, - struct ceph_osdmap *osdmap, uint64_t pool); +extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_locator *oloc, + struct ceph_object_id *oid, + struct ceph_pg *pg_out); + extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, int *acting); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2988d68b24c6..10360dedcdad 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1269,8 +1269,8 @@ static int __map_request(struct ceph_osd_client *osdc, bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_ceph_pg(&pgid, req->r_oid.name, osdc->osdmap, - req->r_oloc.pool); + err = ceph_oloc_oid_to_pg(osdc->osdmap, &req->r_oloc, &req->r_oid, + &pgid); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 8b1a6b48bb5d..768dd04eb9b1 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1090,25 +1090,30 @@ invalid: EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* - * calculate an object layout (i.e. pgid) from an oid, - * file_layout, and osdmap + * Calculate mapping of a (oloc, oid) pair to a PG. Should only be + * called with target's (oloc, oid), since tiering isn't taken into + * account. 
*/ -int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, - struct ceph_osdmap *osdmap, uint64_t pool) +int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_locator *oloc, + struct ceph_object_id *oid, + struct ceph_pg *pg_out) { - struct ceph_pg_pool_info *pool_info; + struct ceph_pg_pool_info *pi; - BUG_ON(!osdmap); - pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); - if (!pool_info) + pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); + if (!pi) return -EIO; - pg->pool = pool; - pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); - dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); + pg_out->pool = oloc->pool; + pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); + + dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, + pg_out->pool, pg_out->seed); return 0; } -EXPORT_SYMBOL(ceph_calc_ceph_pg); +EXPORT_SYMBOL(ceph_oloc_oid_to_pg); static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, int *result, int result_max, -- cgit v1.2.3 From 1b3f2ab51095a7aab684bf9f5c14235126188dbc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: CEPH_OSD_FLAG_* enum update Update CEPH_OSD_FLAG_* enum. (We need CEPH_OSD_FLAG_IGNORE_OVERLAY to support tiering). Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/rados.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 68c96a508ac2..96292df4041b 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -344,6 +344,10 @@ enum { CEPH_OSD_FLAG_EXEC_PUBLIC = 0x1000, /* DEPRECATED op may exec (public) */ CEPH_OSD_FLAG_LOCALIZE_READS = 0x2000, /* read from nearby replica, if any */ CEPH_OSD_FLAG_RWORDERED = 0x4000, /* order wrt concurrent reads */ + CEPH_OSD_FLAG_IGNORE_CACHE = 0x8000, /* ignore cache logic */ + CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ + CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ + CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ }; enum { -- cgit v1.2.3 From ce7f6a2790464047199f54b66420243d433142bd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: add ceph_pg_pool_by_id() "Lookup pool info by ID" function is hidden in osdmap.c. Expose it to the rest of libceph. 
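A hypothetical caller, to illustrate the intended use (sketch only; it assumes the caller already holds a stable reference to osdc->osdmap, e.g. via map_sem):

	struct ceph_pg_pool_info *pi;

	pi = ceph_pg_pool_by_id(osdc->osdmap, oloc.pool);
	if (!pi)
		return -ENOENT;	/* pool deleted, or our osdmap is stale */

	dout("pool %lld: pg_num %u flags 0x%llx\n",
	     oloc.pool, pi->pg_num, pi->flags);

The follow-on tiering commit uses exactly this lookup to consult pi->read_tier and pi->write_tier before mapping a request.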
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osdmap.h | 3 +++ net/ceph/osdmap.c | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ebb8ec285de6..7f894a64c6c7 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -174,6 +174,9 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid); +extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, + u64 id); + extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id); extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 768dd04eb9b1..d69d23556f0c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -464,6 +464,11 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) return NULL; } +struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) +{ + return __lookup_pg_pool(&map->pg_pools, id); +} + const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; -- cgit v1.2.3 From 17a13e4028e6ad7ded079cf32370c47bd0e0fc07 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:19 +0200 Subject: libceph: follow {read,write}_tier fields on osd request submission Overwrite ceph_osd_request::r_oloc.pool with read_tier for read ops and write_tier for write and read+write ops (aka basic tiering support). {read,write}_tier are part of pg_pool_t since v9. This commit bumps our pg_pool_t decode compat version from v7 to v9, all new fields except for {read,write}_tier are ignored. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osdmap.h | 2 ++ net/ceph/osd_client.c | 30 ++++++++++++++++++++++++++++-- net/ceph/osdmap.c | 28 +++++++++++++++++++++++++--- 3 files changed, 55 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 7f894a64c6c7..49ff69f0746b 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -35,6 +35,8 @@ struct ceph_pg_pool_info { u8 object_hash; u32 pg_num, pgp_num; int pg_num_mask, pgp_num_mask; + s64 read_tier; + s64 write_tier; /* wins for read+write ops */ u64 flags; char *name; }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 10360dedcdad..0eb009eaf4ff 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1249,6 +1249,32 @@ static bool __req_should_be_paused(struct ceph_osd_client *osdc, (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); } +/* + * Calculate mapping of a request to a PG. Takes tiering into account. 
+ */ +static int __calc_request_pg(struct ceph_osdmap *osdmap, + struct ceph_osd_request *req, + struct ceph_pg *pg_out) +{ + if ((req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + struct ceph_pg_pool_info *pi; + + pi = ceph_pg_pool_by_id(osdmap, req->r_oloc.pool); + if (pi) { + if ((req->r_flags & CEPH_OSD_FLAG_READ) && + pi->read_tier >= 0) + req->r_oloc.pool = pi->read_tier; + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + pi->write_tier >= 0) + req->r_oloc.pool = pi->write_tier; + } + /* !pi is caught in ceph_oloc_oid_to_pg() */ + } + + return ceph_oloc_oid_to_pg(osdmap, &req->r_oloc, + &req->r_oid, pg_out); +} + /* * Pick an osd (the first 'up' osd in the pg), allocate the osd struct * (as needed), and set the request r_osd appropriately. If there is @@ -1269,8 +1295,8 @@ static int __map_request(struct ceph_osd_client *osdc, bool was_paused; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_oloc_oid_to_pg(osdc->osdmap, &req->r_oloc, &req->r_oid, - &pgid); + + err = __calc_request_pg(osdc->osdmap, req, &pgid); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d69d23556f0c..aade4a5c1c07 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -519,8 +519,8 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); return -EINVAL; } - if (cv > 7) { - pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv); + if (cv > 9) { + pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); return -EINVAL; } len = ceph_decode_32(p); @@ -548,12 +548,34 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) *p += len; } - /* skip removed snaps */ + /* skip removed_snaps */ num = ceph_decode_32(p); *p += num * (8 + 8); *p += 8; /* skip auid */ pi->flags = ceph_decode_64(p); + *p += 4; /* skip crash_replay_interval */ + + if (ev >= 7) + *p += 1; /* skip min_size */ + + if (ev >= 8) + *p += 8 + 8; /* skip quota_max_* */ + + if (ev >= 9) { + /* skip tiers */ + num = ceph_decode_32(p); + *p += num * 8; + + *p += 8; /* skip tier_of */ + *p += 1; /* skip cache_mode */ + + pi->read_tier = ceph_decode_64(p); + pi->write_tier = ceph_decode_64(p); + } else { + pi->read_tier = -1; + pi->write_tier = -1; + } /* ignore the rest */ -- cgit v1.2.3 From 3c972c95c68f455d80ff185aa440857be046bbe0 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:20 +0200 Subject: libceph: rename ceph_osd_request::r_{oloc,oid} to r_base_{oloc,oid} Rename ceph_osd_request::r_{oloc,oid} to r_base_{oloc,oid} before introducing r_target_{oloc,oid} needed for redirects. 
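To make the motivation concrete, this is roughly where ceph_osd_request is headed once the next commit lands (a sketch, not the literal diff):

	struct ceph_osd_request {
		/* ... */
		struct ceph_object_locator r_base_oloc;	/* what the caller asked for */
		struct ceph_object_id r_base_oid;
		struct ceph_object_locator r_target_oloc;	/* where the op is actually */
		struct ceph_object_id r_target_oid;	/* sent: tiering, redirects */
		/* ... */
	};

Keeping the caller-supplied base pair untouched lets the target pair be recomputed on every (re)map without losing the original intent.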
Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- drivers/block/rbd.c | 8 ++++---- include/linux/ceph/osd_client.h | 4 ++-- net/ceph/debugfs.c | 4 ++-- net/ceph/osd_client.c | 30 +++++++++++++++--------------- 4 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6fdc5fc00447..16cab6635163 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1808,8 +1808,8 @@ static struct ceph_osd_request *rbd_osd_req_create( osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_oid, obj_request->object_name); + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); return osd_req; } @@ -1846,8 +1846,8 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) osd_req->r_callback = rbd_osd_req_callback; osd_req->r_priv = obj_request; - osd_req->r_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); - ceph_oid_set_name(&osd_req->r_oid, obj_request->object_name); + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); return osd_req; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 8d8bb53994b1..3170ca6d98b2 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -153,8 +153,8 @@ struct ceph_osd_request { struct inode *r_inode; /* for use by callbacks */ void *r_priv; /* ditto */ - struct ceph_object_locator r_oloc; - struct ceph_object_id r_oid; + struct ceph_object_locator r_base_oloc; + struct ceph_object_id r_base_oid; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 1f8562706393..258a382e75ed 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -132,8 +132,8 @@ static int osdc_show(struct seq_file *s, void *pp) req->r_osd ? 
req->r_osd->o_osd : -1, req->r_pgid.pool, req->r_pgid.seed); - seq_printf(s, "%.*s", req->r_oid.name_len, - req->r_oid.name); + seq_printf(s, "%.*s", req->r_base_oid.name_len, + req->r_base_oid.name); if (req->r_reassert_version.epoch) seq_printf(s, "\t%u'%llu", diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0eb009eaf4ff..3997a87c4f51 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -368,7 +368,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); - req->r_oloc.pool = -1; + req->r_base_oloc.pool = -1; /* create reply message */ if (use_mempool) @@ -763,11 +763,11 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (num_ops > 1) osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); - req->r_oloc.pool = ceph_file_layout_pg_pool(*layout); + req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); - snprintf(req->r_oid.name, sizeof(req->r_oid.name), + snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), "%llx.%08llx", vino.ino, objnum); - req->r_oid.name_len = strlen(req->r_oid.name); + req->r_base_oid.name_len = strlen(req->r_base_oid.name); return req; } @@ -1259,20 +1259,20 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, if ((req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { struct ceph_pg_pool_info *pi; - pi = ceph_pg_pool_by_id(osdmap, req->r_oloc.pool); + pi = ceph_pg_pool_by_id(osdmap, req->r_base_oloc.pool); if (pi) { if ((req->r_flags & CEPH_OSD_FLAG_READ) && pi->read_tier >= 0) - req->r_oloc.pool = pi->read_tier; + req->r_base_oloc.pool = pi->read_tier; if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && pi->write_tier >= 0) - req->r_oloc.pool = pi->write_tier; + req->r_base_oloc.pool = pi->write_tier; } /* !pi is caught in ceph_oloc_oid_to_pg() */ } - return ceph_oloc_oid_to_pg(osdmap, &req->r_oloc, - &req->r_oid, pg_out); + return ceph_oloc_oid_to_pg(osdmap, &req->r_base_oloc, + &req->r_base_oid, pg_out); } /* @@ -1382,7 +1382,7 @@ static void __send_request(struct ceph_osd_client *osdc, /* fill in message content that changes each time we send it */ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_oloc.pool, req->r_request_pool); + put_unaligned_le64(req->r_base_oloc.pool, req->r_request_pool); p = req->r_request_pgid; ceph_encode_64(&p, req->r_pgid.pool); ceph_encode_32(&p, req->r_pgid.seed); @@ -2144,11 +2144,11 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, ceph_encode_32(&p, -1); /* preferred */ /* oid */ - ceph_encode_32(&p, req->r_oid.name_len); - memcpy(p, req->r_oid.name, req->r_oid.name_len); - dout("oid '%.*s' len %d\n", req->r_oid.name_len, - req->r_oid.name, req->r_oid.name_len); - p += req->r_oid.name_len; + ceph_encode_32(&p, req->r_base_oid.name_len); + memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); + dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, + req->r_base_oid.name, req->r_base_oid.name_len); + p += req->r_base_oid.name_len; /* ops--can imply data */ ceph_encode_16(&p, (u16)req->r_num_ops); -- cgit v1.2.3 From 205ee1187a671c3b067d7f1e974903b44036f270 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:20 +0200 Subject: libceph: follow redirect replies from osds Follow redirect replies from osds, for details see ceph.git commit fbbe3ad1220799b7bb00ea30fce581c5eadaf034. 
v1 (current) version of redirect reply consists of oloc and oid, which expands to pool, key, nspace, hash and oid. However, server-side code that would populate anything other than pool doesn't exist yet, and hence this commit adds support for pool redirects only. To make sure that future server-side updates don't break us, we decode all fields and, if any of key, nspace, hash or oid have a non-default value, error out with "corrupt osd_op_reply ..." message. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 6 ++ net/ceph/osd_client.c | 167 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 164 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3170ca6d98b2..fd47e872ebcc 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -155,6 +155,8 @@ struct ceph_osd_request { struct ceph_object_locator r_base_oloc; struct ceph_object_id r_base_oid; + struct ceph_object_locator r_target_oloc; + struct ceph_object_id r_target_oid; u64 r_snapid; unsigned long r_stamp; /* send OR check time */ @@ -162,6 +164,10 @@ struct ceph_osd_request { struct ceph_snap_context *r_snapc; /* snap context for writes */ }; +struct ceph_request_redirect { + struct ceph_object_locator oloc; +}; + struct ceph_osd_event { u64 cookie; int one_shot; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3997a87c4f51..010ff3bd58ad 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -369,6 +369,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_osd_item); req->r_base_oloc.pool = -1; + req->r_target_oloc.pool = -1; /* create reply message */ if (use_mempool) @@ -1256,23 +1257,36 @@ static int __calc_request_pg(struct ceph_osdmap *osdmap, struct ceph_osd_request *req, struct ceph_pg *pg_out) { - if ((req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + bool need_check_tiering; + + need_check_tiering = false; + if (req->r_target_oloc.pool == -1) { + req->r_target_oloc = req->r_base_oloc; /* struct */ + need_check_tiering = true; + } + if (req->r_target_oid.name_len == 0) { + ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); + need_check_tiering = true; + } + + if (need_check_tiering && + (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { struct ceph_pg_pool_info *pi; - pi = ceph_pg_pool_by_id(osdmap, req->r_base_oloc.pool); + pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); if (pi) { if ((req->r_flags & CEPH_OSD_FLAG_READ) && pi->read_tier >= 0) - req->r_base_oloc.pool = pi->read_tier; + req->r_target_oloc.pool = pi->read_tier; if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && pi->write_tier >= 0) - req->r_base_oloc.pool = pi->write_tier; + req->r_target_oloc.pool = pi->write_tier; } /* !pi is caught in ceph_oloc_oid_to_pg() */ } - return ceph_oloc_oid_to_pg(osdmap, &req->r_base_oloc, - &req->r_base_oid, pg_out); + return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, + &req->r_target_oid, pg_out); } /* @@ -1382,7 +1396,7 @@ static void __send_request(struct ceph_osd_client *osdc, /* fill in message content that changes each time we send it */ put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_base_oloc.pool, req->r_request_pool); + put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); p = req->r_request_pgid; ceph_encode_64(&p, req->r_pgid.pool); 
ceph_encode_32(&p, req->r_pgid.seed); @@ -1483,6 +1497,109 @@ static void handle_osds_timeout(struct work_struct *work) round_jiffies_relative(delay)); } +static int ceph_oloc_decode(void **p, void *end, + struct ceph_object_locator *oloc) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret = 0; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_v < 3) { + pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + if (struct_cv > 6) { + pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + oloc->pool = ceph_decode_64(p); + *p += 4; /* skip preferred */ + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::key is set\n"); + goto e_inval; + } + + if (struct_v >= 5) { + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::nspace is set\n"); + goto e_inval; + } + } + + if (struct_v >= 6) { + s64 hash = ceph_decode_64(p); + if (hash != -1) { + pr_warn("ceph_object_locator::hash is set\n"); + goto e_inval; + } + } + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} + +static int ceph_redirect_decode(void **p, void *end, + struct ceph_request_redirect *redir) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_cv > 1) { + pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + ret = ceph_oloc_decode(p, end, &redir->oloc); + if (ret) + goto out; + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_request_redirect::object_name is set\n"); + goto e_inval; + } + + len = ceph_decode_32(p); + *p += len; /* skip osd_instructions */ + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} + static void complete_request(struct ceph_osd_request *req) { complete_all(&req->r_safe_completion); /* fsync waiter */ @@ -1497,6 +1614,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, { void *p, *end; struct ceph_osd_request *req; + struct ceph_request_redirect redir; u64 tid; int object_len; unsigned int numops; @@ -1576,10 +1694,41 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, for (i = 0; i < numops; i++) req->r_reply_op_result[i] = ceph_decode_32(&p); - already_completed = req->r_got_reply; + if (le16_to_cpu(msg->hdr.version) >= 6) { + p += 8 + 4; /* skip replay_version */ + p += 8; /* skip user_version */ - if (!req->r_got_reply) { + err = ceph_redirect_decode(&p, end, &redir); + if (err) + goto bad_put; + } else { + redir.oloc.pool = -1; + } + if (redir.oloc.pool != -1) { + dout("redirect pool %lld\n", redir.oloc.pool); + + __unregister_request(osdc, req); + mutex_unlock(&osdc->request_mutex); + + req->r_target_oloc = redir.oloc; /* struct */ + + /* + * Start redirect requests with nofail=true. If + * mapping fails, request will end up on the notarget + * list, waiting for the new osdmap (which can take + * a while), even though the original request mapped + * successfully. 
In the future we might want to follow + * original request's nofail setting here. + */ + err = ceph_osdc_start_request(osdc, req, true); + BUG_ON(err); + + goto done; + } + + already_completed = req->r_got_reply; + if (!req->r_got_reply) { req->r_result = result; dout("handle_reply result %d bytes %d\n", req->r_result, bytes); -- cgit v1.2.3 From 80e163a58c0c69ef1a0ba3500d9932b14d67bf64 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 27 Jan 2014 17:40:20 +0200 Subject: libceph: support CEPH_FEATURE_OSD_CACHEPOOL feature Announce our (limited, see previous commit) support for CACHEPOOL feature. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index d21b34d94436..138448f766b4 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -80,6 +80,7 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ CEPH_FEATURE_OSDHASHPSPOOL | \ + CEPH_FEATURE_OSD_CACHEPOOL | \ CEPH_FEATURE_CRUSH_V2 | \ CEPH_FEATURE_EXPORT_PEER) -- cgit v1.2.3
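For completeness, a sketch of how a libceph consumer ends up advertising the bit (illustrative only; "opt" and "private" are assumed to come from the caller):

	u64 supported = CEPH_FEATURES_SUPPORTED_DEFAULT;	/* now includes OSD_CACHEPOOL */
	u64 required = CEPH_FEATURES_REQUIRED_DEFAULT;
	struct ceph_client *client;

	client = ceph_create_client(opt, private, supported, required);
	if (IS_ERR(client))
		return PTR_ERR(client);

Since this commit extends the default supported mask itself, every client built against it advertises cache pool support without any per-caller changes.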