1#ifndef CEPH_RADOS_H 2#define CEPH_RADOS_H 3 4/* 5 * Data types for the Ceph distributed object storage layer RADOS 6 * (Reliable Autonomic Distributed Object Store). 7 */ 8 9#include "msgr.h" 10 11/* 12 * osdmap encoding versions 13 */ 14#define CEPH_OSDMAP_INC_VERSION 5 15#define CEPH_OSDMAP_INC_VERSION_EXT 5 16#define CEPH_OSDMAP_VERSION 5 17#define CEPH_OSDMAP_VERSION_EXT 5 18 19/* 20 * fs id 21 */ 22struct ceph_fsid { 23 unsigned char fsid[16]; 24}; 25 26static inline int ceph_fsid_compare(const struct ceph_fsid *a, 27 const struct ceph_fsid *b) 28{ 29 return memcmp(a, b, sizeof(*a)); 30} 31 32/* 33 * ino, object, etc. 34 */ 35typedef __le64 ceph_snapid_t; 36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */ 37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */ 38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */ 39 40struct ceph_timespec { 41 __le32 tv_sec; 42 __le32 tv_nsec; 43} __attribute__ ((packed)); 44 45 46/* 47 * object layout - how objects are mapped into PGs 48 */ 49#define CEPH_OBJECT_LAYOUT_HASH 1 50#define CEPH_OBJECT_LAYOUT_LINEAR 2 51#define CEPH_OBJECT_LAYOUT_HASHINO 3 52 53/* 54 * pg layout -- how PGs are mapped onto (sets of) OSDs 55 */ 56#define CEPH_PG_LAYOUT_CRUSH 0 57#define CEPH_PG_LAYOUT_HASH 1 58#define CEPH_PG_LAYOUT_LINEAR 2 59#define CEPH_PG_LAYOUT_HYBRID 3 60 61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ 62 63/* 64 * placement group. 65 * we encode this into one __le64. 66 */ 67struct ceph_pg { 68 __le16 preferred; /* preferred primary osd */ 69 __le16 ps; /* placement seed */ 70 __le32 pool; /* object pool */ 71} __attribute__ ((packed)); 72 73/* 74 * pg_pool is a set of pgs storing a pool of objects 75 * 76 * pg_num -- base number of pseudorandomly placed pgs 77 * 78 * pgp_num -- effective number when calculating pg placement. this 79 * is used for pg_num increases. new pgs result in data being "split" 80 * into new pgs. for this to proceed smoothly, new pgs are intiially 81 * colocated with their parents; that is, pgp_num doesn't increase 82 * until the new pgs have successfully split. only _then_ are the new 83 * pgs placed independently. 84 * 85 * lpg_num -- localized pg count (per device). replicas are randomly 86 * selected. 87 * 88 * lpgp_num -- as above. 89 */ 90#define CEPH_PG_TYPE_REP 1 91#define CEPH_PG_TYPE_RAID4 2 92#define CEPH_PG_POOL_VERSION 2 93struct ceph_pg_pool { 94 __u8 type; /* CEPH_PG_TYPE_* */ 95 __u8 size; /* number of osds in each pg */ 96 __u8 crush_ruleset; /* crush placement rule */ 97 __u8 object_hash; /* hash mapping object name to ps */ 98 __le32 pg_num, pgp_num; /* number of pg's */ 99 __le32 lpg_num, lpgp_num; /* number of localized pg's */ 100 __le32 last_change; /* most recent epoch changed */ 101 __le64 snap_seq; /* seq for per-pool snapshot */ 102 __le32 snap_epoch; /* epoch of last snap */ 103 __le32 num_snaps; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */ 105 __le64 auid; /* who owns the pg */ 106} __attribute__ ((packed)); 107 108/* 109 * stable_mod func is used to control number of placement groups. 110 * similar to straight-up modulo, but produces a stable mapping as b 111 * increases over time. b is the number of bins, and bmask is the 112 * containing power of 2 minus 1. 113 * 114 * b <= bmask and bmask=(2**n)-1 115 * e.g., b=12 -> bmask=15, b=123 -> bmask=127 116 */ 117static inline int ceph_stable_mod(int x, int b, int bmask) 118{ 119 if ((x & bmask) < b) 120 return x & bmask; 121 else 122 return x & (bmask >> 1); 123} 124 125/* 126 * object layout - how a given object should be stored. 127 */ 128struct ceph_object_layout { 129 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */ 130 __le32 ol_stripe_unit; /* for per-object parity, if any */ 131} __attribute__ ((packed)); 132 133/* 134 * compound epoch+version, used by storage layer to serialize mutations 135 */ 136struct ceph_eversion { 137 __le32 epoch; 138 __le64 version; 139} __attribute__ ((packed)); 140 141/* 142 * osd map bits 143 */ 144 145/* status bits */ 146#define CEPH_OSD_EXISTS 1 147#define CEPH_OSD_UP 2 148 149/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */ 150#define CEPH_OSD_IN 0x10000 151#define CEPH_OSD_OUT 0 152 153 154/* 155 * osd map flag bits 156 */ 157#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */ 158#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */ 159#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */ 160#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */ 161#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */ 162 163/* 164 * osd ops 165 */ 166#define CEPH_OSD_OP_MODE 0xf000 167#define CEPH_OSD_OP_MODE_RD 0x1000 168#define CEPH_OSD_OP_MODE_WR 0x2000 169#define CEPH_OSD_OP_MODE_RMW 0x3000 170#define CEPH_OSD_OP_MODE_SUB 0x4000 171 172#define CEPH_OSD_OP_TYPE 0x0f00 173#define CEPH_OSD_OP_TYPE_LOCK 0x0100 174#define CEPH_OSD_OP_TYPE_DATA 0x0200 175#define CEPH_OSD_OP_TYPE_ATTR 0x0300 176#define CEPH_OSD_OP_TYPE_EXEC 0x0400 177#define CEPH_OSD_OP_TYPE_PG 0x0500 178 179enum { 180 /** data **/ 181 /* read */ 182 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, 183 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, 184 185 /* fancy read */ 186 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, 187 188 /* write */ 189 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, 190 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2, 191 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3, 192 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4, 193 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5, 194 195 /* fancy write */ 196 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6, 197 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7, 198 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8, 199 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9, 200 201 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10, 202 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11, 203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, 204 205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, 206 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, 207 208 /** attrs **/ 209 /* read */ 210 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 211 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 212 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3, 213 214 /* write */ 215 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 216 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2, 217 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3, 218 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, 219 220 /** subop **/ 221 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, 222 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, 223 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, 224 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, 225 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, 226 227 /** lock **/ 228 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, 229 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2, 230 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3, 231 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4, 232 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5, 233 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6, 234 235 /** exec **/ 236 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1, 237 238 /** pg **/ 239 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1, 240}; 241 242static inline int ceph_osd_op_type_lock(int op) 243{ 244 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK; 245} 246static inline int ceph_osd_op_type_data(int op) 247{ 248 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA; 249} 250static inline int ceph_osd_op_type_attr(int op) 251{ 252 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR; 253} 254static inline int ceph_osd_op_type_exec(int op) 255{ 256 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC; 257} 258static inline int ceph_osd_op_type_pg(int op) 259{ 260 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG; 261} 262 263static inline int ceph_osd_op_mode_subop(int op) 264{ 265 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB; 266} 267static inline int ceph_osd_op_mode_read(int op) 268{ 269 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD; 270} 271static inline int ceph_osd_op_mode_modify(int op) 272{ 273 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; 274} 275 276/* 277 * note that the following tmap stuff is also defined in the ceph librados.h 278 * any modification here needs to be updated there 279 */ 280#define CEPH_OSD_TMAP_HDR 'h' 281#define CEPH_OSD_TMAP_SET 's' 282#define CEPH_OSD_TMAP_RM 'r' 283 284extern const char *ceph_osd_op_name(int op); 285 286 287/* 288 * osd op flags 289 * 290 * An op may be READ, WRITE, or READ|WRITE. 291 */ 292enum { 293 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */ 294 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */ 295 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */ 296 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */ 297 CEPH_OSD_FLAG_READ = 16, /* op may read */ 298 CEPH_OSD_FLAG_WRITE = 32, /* op may write */ 299 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */ 300 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */ 301 CEPH_OSD_FLAG_BALANCE_READS = 256, 302 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ 303 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ 304 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ 305 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */ 306}; 307 308enum { 309 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ 310}; 311 312#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 313#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 314 315/* xattr comparison */ 316enum { 317 CEPH_OSD_CMPXATTR_OP_NOP = 0, 318 CEPH_OSD_CMPXATTR_OP_EQ = 1, 319 CEPH_OSD_CMPXATTR_OP_NE = 2, 320 CEPH_OSD_CMPXATTR_OP_GT = 3, 321 CEPH_OSD_CMPXATTR_OP_GTE = 4, 322 CEPH_OSD_CMPXATTR_OP_LT = 5, 323 CEPH_OSD_CMPXATTR_OP_LTE = 6 324}; 325 326enum { 327 CEPH_OSD_CMPXATTR_MODE_STRING = 1, 328 CEPH_OSD_CMPXATTR_MODE_U64 = 2 329}; 330 331/* 332 * an individual object operation. each may be accompanied by some data 333 * payload 334 */ 335struct ceph_osd_op { 336 __le16 op; /* CEPH_OSD_OP_* */ 337 __le32 flags; /* CEPH_OSD_FLAG_* */ 338 union { 339 struct { 340 __le64 offset, length; 341 __le64 truncate_size; 342 __le32 truncate_seq; 343 } __attribute__ ((packed)) extent; 344 struct { 345 __le32 name_len; 346 __le32 value_len; 347 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ 348 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ 349 } __attribute__ ((packed)) xattr; 350 struct { 351 __u8 class_len; 352 __u8 method_len; 353 __u8 argc; 354 __le32 indata_len; 355 } __attribute__ ((packed)) cls; 356 struct { 357 __le64 cookie, count; 358 } __attribute__ ((packed)) pgls; 359 struct { 360 __le64 snapid; 361 } __attribute__ ((packed)) snap; 362 }; 363 __le32 payload_len; 364} __attribute__ ((packed)); 365 366/* 367 * osd request message header. each request may include multiple 368 * ceph_osd_op object operations. 369 */ 370struct ceph_osd_request_head { 371 __le32 client_inc; /* client incarnation */ 372 struct ceph_object_layout layout; /* pgid */ 373 __le32 osdmap_epoch; /* client's osdmap epoch */ 374 375 __le32 flags; 376 377 struct ceph_timespec mtime; /* for mutations only */ 378 struct ceph_eversion reassert_version; /* if we are replaying op */ 379 380 __le32 object_len; /* length of object name */ 381 382 __le64 snapid; /* snapid to read */ 383 __le64 snap_seq; /* writer's snap context */ 384 __le32 num_snaps; 385 386 __le16 num_ops; 387 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ 388} __attribute__ ((packed)); 389 390struct ceph_osd_reply_head { 391 __le32 client_inc; /* client incarnation */ 392 __le32 flags; 393 struct ceph_object_layout layout; 394 __le32 osdmap_epoch; 395 struct ceph_eversion reassert_version; /* for replaying uncommitted */ 396 397 __le32 result; /* result code */ 398 399 __le32 object_len; /* length of object name */ 400 __le32 num_ops; 401 struct ceph_osd_op ops[0]; /* ops[], object */ 402} __attribute__ ((packed)); 403 404 405#endif 406