1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _FS_CEPH_OSD_CLIENT_H
3#define _FS_CEPH_OSD_CLIENT_H
4
5#include <linux/bitrev.h>
6#include <linux/completion.h>
7#include <linux/kref.h>
8#include <linux/mempool.h>
9#include <linux/rbtree.h>
10#include <linux/refcount.h>
11#include <linux/ktime.h>
12
13#include <linux/ceph/types.h>
14#include <linux/ceph/osdmap.h>
15#include <linux/ceph/messenger.h>
16#include <linux/ceph/msgpool.h>
17#include <linux/ceph/auth.h>
18#include <linux/ceph/pagelist.h>
19
/*
 * Forward declarations: at this point the header only needs pointers to
 * these types (ceph_osd_request and ceph_osd_client are fully defined
 * further down in this file).
 */
20struct ceph_msg;
21struct ceph_snap_context;
22struct ceph_osd_request;
23struct ceph_osd_client;
24
25/*
26 * completion callback for async writepages
27 */
/*
 * Callback type: takes the request as its only argument and returns nothing.
 * struct ceph_osd_request carries one of these in its r_callback member.
 */
28typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
29
/*
 * OSD number sentinel (-1).  The value is parenthesized so the expansion
 * stays a single operand wherever the macro appears in a larger expression.
 * NOTE(review): presumably marks something that is not assigned to a real
 * OSD (cf. ceph_osd_client::homeless_osd) -- confirm in the implementation.
 */
#define CEPH_HOMELESS_OSD	(-1)
31
32/*
33 * A single extent in a SPARSE_READ reply.
34 *
35 * Note that these come from the OSD as little-endian values. On BE arches,
36 * we convert them in-place after receipt.
37 */
38struct ceph_sparse_extent {
39	u64	off;	/* offset of the extent */
40	u64	len;	/* length of the extent */
41} __packed;	/* two u64 fields, no padding */
42
43/* Sparse read state machine state values */
44enum ceph_sparse_read_state {
	/* Explicitly 0: a zero-initialized sr_state holds this value. */
45	CEPH_SPARSE_READ_HDR	= 0,
	/*
	 * NOTE(review): the remaining names suggest the order extent array ->
	 * data length -> data; the actual transitions are not visible in this
	 * header -- confirm in the code that drives sr_state.
	 */
46	CEPH_SPARSE_READ_EXTENTS,
47	CEPH_SPARSE_READ_DATA_LEN,
48	CEPH_SPARSE_READ_DATA_PRE,
49	CEPH_SPARSE_READ_DATA,
50};
51
52/*
53 * A SPARSE_READ reply is a 32-bit count of extents, followed by an array of
54 * 64-bit offset/length pairs, and then all of the actual file data
55 * concatenated after it (sans holes).
56 *
57 * Unfortunately, we don't know how long the extent array is until we've
58 * started reading the data section of the reply. The caller should send down
59 * a destination buffer for the array, but we'll alloc one if it's too small
60 * or if the caller doesn't.
61 */
62struct ceph_sparse_read {
63	enum ceph_sparse_read_state	sr_state;    /* state machine state */
64	u64				sr_req_off;  /* orig request offset */
65	u64				sr_req_len;  /* orig request length */
66	u64				sr_pos;      /* current pos in buffer */
	/*
	 * NOTE(review): sr_index and sr_ext_len are signed ints while sr_count
	 * is u32; keep the signedness difference in mind when comparing them.
	 */
67	int				sr_index;    /* current extent index */
68	u32				sr_datalen;  /* length of actual data */
69	u32				sr_count;    /* extent count in reply */
70	int				sr_ext_len;  /* length of extent array */
71	struct ceph_sparse_extent	*sr_extent;  /* extent array */
72};
73
74/*
75 * A given osd we're communicating with.
76 *
77 * Note that the o_requests tree can be searched while holding the "lock" mutex
78 * or the "o_requests_lock" spinlock. Insertion or removal requires both!
79 */
80struct ceph_osd {
81	refcount_t o_ref;	/* reference count */
82	int o_sparse_op_idx;
	/* NOTE(review): presumably the osd client this osd belongs to -- confirm */
83	struct ceph_osd_client *o_osdc;
84	int o_osd;
85	int o_incarnation;
	/* NOTE(review): presumably links into ceph_osd_client::osds -- confirm */
86	struct rb_node o_node;
87	struct ceph_connection o_con;	/* connection, embedded by value */
	/*
	 * o_requests_lock and o_requests: see the locking rules in the comment
	 * above this struct (search under either lock, modify under both).
	 */
88	spinlock_t o_requests_lock;
89	struct rb_root o_requests;
	/*
	 * NOTE(review): the comment above only spells out locking for
	 * o_requests; the rules for the trees below are not stated here.
	 */
90	struct rb_root o_linger_requests;
91	struct rb_root o_backoff_mappings;
92	struct rb_root o_backoffs_by_id;
	/* NOTE(review): presumably links into ceph_osd_client::osd_lru -- confirm */
93	struct list_head o_osd_lru;
94	struct ceph_auth_handshake o_auth;
95	unsigned long lru_ttl;
96	struct list_head o_keepalive_item;
97	struct mutex lock;	/* the "lock" mutex named in the comment above */
98	struct ceph_sparse_read	o_sparse_read;	/* sparse read state, one per osd */
99};
100
/*
 * Op-count constants.
 * NOTE(review): how they are applied (e.g. as bounds on the number of ops in
 * a request) is not visible in this header -- check the users.
 */
101#define CEPH_OSD_SLAB_OPS	2
102#define CEPH_OSD_MAX_OPS	16
103
/*
 * Type tag stored in ceph_osd_data::type.
 *
 * CEPH_OSD_DATA_TYPE_BIO only exists when CONFIG_BLOCK is set, so the numeric
 * values of the enumerators after it depend on the kernel configuration;
 * always compare against the names, never against raw numbers.
 */
104enum ceph_osd_data_type {
105	CEPH_OSD_DATA_TYPE_NONE = 0,	/* value of a zero-initialized tag */
106	CEPH_OSD_DATA_TYPE_PAGES,
107	CEPH_OSD_DATA_TYPE_PAGELIST,
108#ifdef CONFIG_BLOCK
109	CEPH_OSD_DATA_TYPE_BIO,
110#endif /* CONFIG_BLOCK */
111	CEPH_OSD_DATA_TYPE_BVECS,
112	CEPH_OSD_DATA_TYPE_ITER,
113};
114
/*
 * One data buffer description: a type tag followed by an anonymous union of
 * the possible buffer representations.
 * NOTE(review): the pairing of union members with enum ceph_osd_data_type
 * values noted below is by name only; confirm against the code that sets
 * @type.
 */
115struct ceph_osd_data {
116	enum ceph_osd_data_type	type;
117	union {
		/* page vector (cf. CEPH_OSD_DATA_TYPE_PAGES) */
118		struct {
119			struct page	**pages;
120			u64		length;
121			u32		alignment;
122			bool		pages_from_pool;
123			bool		own_pages;
124		};
		/* page list (cf. CEPH_OSD_DATA_TYPE_PAGELIST) */
125		struct ceph_pagelist	*pagelist;
126#ifdef CONFIG_BLOCK
		/* bio iterator plus length; only built with CONFIG_BLOCK */
127		struct {
128			struct ceph_bio_iter	bio_pos;
129			u32			bio_length;
130		};
131#endif /* CONFIG_BLOCK */
		/* bio_vec iterator plus count (cf. CEPH_OSD_DATA_TYPE_BVECS) */
132		struct {
133			struct ceph_bvec_iter	bvec_pos;
134			u32			num_bvecs;
135		};
		/* iov_iter, embedded by value (cf. CEPH_OSD_DATA_TYPE_ITER) */
136		struct iov_iter		iter;
137	};
138};
139
/*
 * One operation of an OSD request (element type of ceph_osd_request::r_ops).
 *
 * A few common fields are followed by an anonymous union with one named
 * member per kind of operation; the osd_req_op_data() macro in this header
 * addresses fields as r_ops[i].<member>.<field>.
 * NOTE(review): which union member is valid for a given @op code is not
 * expressed in this header -- confirm in the implementation.
 */
140struct ceph_osd_req_op {
141	u16 op;           /* CEPH_OSD_OP_* */
142	u32 flags;        /* CEPH_OSD_OP_FLAG_* */
143	u32 indata_len;   /* request */
144	u32 outdata_len;  /* reply */
145	s32 rval;
146
147	union {
		/* a bare data descriptor, not wrapped in a per-op struct */
148		struct ceph_osd_data raw_data_in;
149		struct {
150			u64 offset, length;
151			u64 truncate_size;
152			u32 truncate_seq;
			/* sparse extent array and its entry count (signed int) */
153			int sparse_ext_cnt;
154			struct ceph_sparse_extent *sparse_ext;
155			struct ceph_osd_data osd_data;
156		} extent;
157		struct {
158			u32 name_len;
159			u32 value_len;
160			__u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
161			__u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
162			struct ceph_osd_data osd_data;
163		} xattr;
164		struct {
165			const char *class_name;
166			const char *method_name;
167			struct ceph_osd_data request_info;
168			struct ceph_osd_data request_data;
169			struct ceph_osd_data response_data;
			/* lengths are __u8, i.e. at most 255 */
170			__u8 class_len;
171			__u8 method_len;
			/* distinct field from the outer indata_len above */
172			u32 indata_len;
173		} cls;
174		struct {
175			u64 cookie;
176			__u8 op;           /* CEPH_OSD_WATCH_OP_ */
177			u32 gen;
178		} watch;
179		struct {
180			struct ceph_osd_data request_data;
181		} notify_ack;
182		struct {
183			u64 cookie;
184			struct ceph_osd_data request_data;
185			struct ceph_osd_data response_data;
186		} notify;
187		struct {
188			struct ceph_osd_data response_data;
189		} list_watchers;
190		struct {
191			u64 expected_object_size;
192			u64 expected_write_size;
193			u32 flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
194		} alloc_hint;
195		struct {
196			u64 snapid;
197			u64 src_version;
198			u8 flags;
199			u32 src_fadvise_flags;
200			struct ceph_osd_data osd_data;
201		} copy_from;
202		struct {
203			u64 ver;
204		} assert_ver;
205	};
206};
207
/*
 * Object identity and placement state of a request.  Embedded by value in
 * struct ceph_osd_request (as r_t) and struct ceph_osd_linger_request (as t).
 */
208struct ceph_osd_request_target {
	/* base and target object id / locator pairs */
209	struct ceph_object_id base_oid;
210	struct ceph_object_locator base_oloc;
211	struct ceph_object_id target_oid;
212	struct ceph_object_locator target_oloc;
213
214	struct ceph_pg pgid;               /* last raw pg we mapped to */
215	struct ceph_spg spgid;             /* last actual spg we mapped to */
216	u32 pg_num;
217	u32 pg_num_mask;
218	struct ceph_osds acting;
219	struct ceph_osds up;
220	int size;
221	int min_size;
222	bool sort_bitwise;
223	bool recovery_deletes;
224
225	unsigned int flags;                /* CEPH_OSD_FLAG_* */
226	bool used_replica;
227	bool paused;
228
229	u32 epoch;
230	u32 last_force_resend;
231
	/*
	 * NOTE(review): presumably the OSD number this target maps to, with
	 * CEPH_HOMELESS_OSD when there is none -- confirm in the implementation.
	 */
232	int osd;
233};
234
235/* an in-flight request */
236struct ceph_osd_request {
237	u64             r_tid;              /* unique for this client */
238	struct rb_node  r_node;
239	struct rb_node  r_mc_node;          /* map check */
240	struct work_struct r_complete_work;
241	struct ceph_osd *r_osd;
242
	/*
	 * Target state, embedded by value.  The three macros below are
	 * shorthand for members of r_t (e.g. req->r_flags is req->r_t.flags).
	 */
243	struct ceph_osd_request_target r_t;
244#define r_base_oid	r_t.base_oid
245#define r_base_oloc	r_t.base_oloc
246#define r_flags		r_t.flags
247
248	struct ceph_msg  *r_request, *r_reply;
249	u32               r_sent;      /* >0 if r_request is sending/sent */
250
251	/* request osd ops array  */
	/* element count of r_ops[] below (see its __counted_by annotation) */
252	unsigned int		r_num_ops;
253
254	int               r_result;
255
256	struct ceph_osd_client *r_osdc;
257	struct kref       r_kref;
258	bool              r_mempool;
259	bool		  r_linger;           /* don't resend on failure */
260	struct completion r_completion;       /* private to osd_client.c */
261	ceph_osdc_callback_t r_callback;
262
263	struct inode *r_inode;         	      /* for use by callbacks */
264	struct list_head r_private_item;      /* ditto */
265	void *r_priv;			      /* ditto */
266
267	/* set by submitter */
268	u64 r_snapid;                         /* for reads, CEPH_NOSNAP o/w */
269	struct ceph_snap_context *r_snapc;    /* for writes */
270	struct timespec64 r_mtime;            /* ditto */
271	u64 r_data_offset;                    /* ditto */
272
273	/* internal */
274	u64 r_version;			      /* data version sent in reply */
275	unsigned long r_stamp;                /* jiffies, send or check time */
276	unsigned long r_start_stamp;          /* jiffies */
277	ktime_t r_start_latency;              /* ktime_t */
278	ktime_t r_end_latency;                /* ktime_t */
279	int r_attempts;
280	u32 r_map_dne_bound;
281
	/*
	 * Flexible array member: the ops are stored inline after the fixed
	 * part of the request, so the struct size depends on r_num_ops.
	 */
282	struct ceph_osd_req_op r_ops[] __counted_by(r_num_ops);
283};
284
/* Wrapper around a single object locator. */
285struct ceph_request_redirect {
286	struct ceph_object_locator oloc;
287};
288
289/*
290 * osd request identifier
291 *
292 * caller name + incarnation# + tid to uniquely identify this request
293 */
294struct ceph_osd_reqid {
295	struct ceph_entity_name name;
	/* tid and inc are declared with little-endian annotated types */
296	__le64 tid;
297	__le32 inc;
298} __packed;	/* no padding between members */
299
/* Three 64-bit ids, declared little-endian and packed (24 bytes, no padding). */
300struct ceph_blkin_trace_info {
301	__le64 trace_id;
302	__le64 span_id;
303	__le64 parent_span_id;
304} __packed;
305
/*
 * Callback types taken by ceph_osdc_watch() and stored in
 * struct ceph_osd_linger_request (wcb / errcb).
 *
 * rados_watchcb2_t receives an opaque @arg, the notify id, the cookie, the
 * notifier id and a data buffer with its length; rados_watcherrcb_t receives
 * @arg, the cookie and an error code.
 * NOTE(review): @arg is presumably the "data" pointer handed to
 * ceph_osdc_watch() -- confirm in the implementation.
 */
306typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
307				 u64 notifier_id, void *data, size_t data_len);
308typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
309
/*
 * State of a linger request; is_watch tells a watch from a notify.
 * This is the handle type returned by ceph_osdc_watch() and taken by
 * ceph_osdc_unwatch() / ceph_osdc_watch_check().
 */
310struct ceph_osd_linger_request {
311	struct ceph_osd_client *osdc;
312	u64 linger_id;
313	bool committed;
314	bool is_watch;                  /* watch or notify */
315
316	struct ceph_osd *osd;
	/* two ordinary requests referenced by pointer */
317	struct ceph_osd_request *reg_req;
318	struct ceph_osd_request *ping_req;
319	unsigned long ping_sent;
320	unsigned long watch_valid_thru;
321	struct list_head pending_lworks;
322
	/* target state, same type as ceph_osd_request::r_t */
323	struct ceph_osd_request_target t;
324	u32 map_dne_bound;
325
326	struct timespec64 mtime;
327
328	struct kref kref;
329	struct mutex lock;
330	struct rb_node node;            /* osd */
331	struct rb_node osdc_node;       /* osdc */
332	struct rb_node mc_node;         /* map check */
333	struct list_head scan_item;
334
335	struct completion reg_commit_wait;
336	struct completion notify_finish_wait;
337	int reg_commit_error;
338	int notify_finish_error;
339	int last_error;
340
341	u32 register_gen;
342	u64 notify_id;
343
	/* callbacks plus an opaque pointer (cf. ceph_osdc_watch() arguments) */
344	rados_watchcb2_t wcb;
345	rados_watcherrcb_t errcb;
346	void *data;
347
348	struct ceph_pagelist *request_pl;
349	struct page **notify_id_pages;
350
	/*
	 * Same types as the preply_pages / preply_len out-parameters of
	 * ceph_osdc_notify().
	 * NOTE(review): presumably they hold those caller pointers -- confirm.
	 */
351	struct page ***preply_pages;
352	size_t *preply_len;
353};
354
/*
 * One watcher record: entity name, cookie and address.
 * ceph_osdc_list_watchers() takes a struct ceph_watch_item ** out-parameter.
 */
355struct ceph_watch_item {
356	struct ceph_entity_name name;
357	u64 cookie;
358	struct ceph_entity_addr addr;
359};
360
/*
 * An rb-tree node keyed material (spgid) plus an rb-tree root of backoffs.
 * NOTE(review): presumably linked into ceph_osd::o_backoff_mappings through
 * @node, with @backoffs holding ceph_osd_backoff entries via their spg_node
 * -- confirm in the implementation.
 */
361struct ceph_spg_mapping {
362	struct rb_node node;
363	struct ceph_spg spgid;
364
365	struct rb_root backoffs;
366};
367
368struct ceph_hobject_id {
	/* key, oid and nspace are each an untyped pointer plus a byte length */
369	void *key;
370	size_t key_len;
371	void *oid;
372	size_t oid_len;
373	u64 snapid;
374	u32 hash;
375	u8 is_max;
376	void *nspace;
377	size_t nspace_len;
378	s64 pool;
379
380	/* cache */
	/* bitrev32(hash), filled in by ceph_hoid_build_hash_cache() */
381	u32 hash_reverse_bits;
382};
383
384static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid)
385{
386	hoid->hash_reverse_bits = bitrev32(hoid->hash);
387}
388
389/*
390 * PG-wide backoff: [begin, end)
391 * per-object backoff: begin == end
392 */
393struct ceph_osd_backoff {
	/*
	 * Two separate rb-tree linkages.
	 * NOTE(review): presumably spg_node is for ceph_spg_mapping::backoffs
	 * and id_node for ceph_osd::o_backoffs_by_id -- confirm.
	 */
394	struct rb_node spg_node;
395	struct rb_node id_node;
396
397	struct ceph_spg spgid;
398	u64 id;
	/* range bounds; see the comment above for [begin, end) vs. begin == end */
399	struct ceph_hobject_id *begin;
400	struct ceph_hobject_id *end;
401};
402
/*
 * 64-bit constant with the top 16 bits set and the low 48 bits clear.
 * NOTE(review): presumably the starting value for linger ids (cf.
 * ceph_osd_client::last_linger_id) -- confirm in the implementation.
 */
403#define CEPH_LINGER_ID_START	0xffff000000000000ULL
404
/*
 * Per-client OSD state: the current osdmap, the set of osds, request
 * bookkeeping, timers, pools and workqueues.
 */
405struct ceph_osd_client {
406	struct ceph_client     *client;
407
408	struct ceph_osdmap     *osdmap;       /* current map */
	/* NOTE(review): exactly what this rwsem protects is not stated here */
409	struct rw_semaphore    lock;
410
411	struct rb_root         osds;          /* osds */
412	struct list_head       osd_lru;       /* idle osds */
413	spinlock_t             osd_lru_lock;
414	u32		       epoch_barrier;
	/* a struct ceph_osd embedded by value (not a pointer) */
415	struct ceph_osd        homeless_osd;
416	atomic64_t             last_tid;      /* tid of last request */
417	u64                    last_linger_id;
418	struct rb_root         linger_requests; /* lingering requests */
419	struct rb_root         map_checks;
420	struct rb_root         linger_map_checks;
421	atomic_t               num_requests;
422	atomic_t               num_homeless;
423	int                    abort_err;
424	struct delayed_work    timeout_work;
425	struct delayed_work    osds_timeout_work;
	/* member only exists when CONFIG_DEBUG_FS is set */
426#ifdef CONFIG_DEBUG_FS
427	struct dentry 	       *debugfs_file;
428#endif
429
430	mempool_t              *req_mempool;
431
432	struct ceph_msgpool	msgpool_op;
433	struct ceph_msgpool	msgpool_op_reply;
434
435	struct workqueue_struct	*notify_wq;
436	struct workqueue_struct	*completion_wq;
437};
438
439static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag)
440{
441	return osdc->osdmap->flags & flag;
442}
443
/*
 * OSD client entry points: setup/cleanup (no arguments), per-client
 * init/stop/reopen, handlers that take a received message, and the epoch
 * barrier / abort helpers.  Only the prototypes live in this header; the
 * semantics are defined by the implementation.
 *
 * Some prototypes carry "extern" and some do not; for function declarations
 * the two spellings are equivalent.
 */
444extern int ceph_osdc_setup(void);
445extern void ceph_osdc_cleanup(void);
446
447extern int ceph_osdc_init(struct ceph_osd_client *osdc,
448			  struct ceph_client *client);
449extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
450extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
451
452extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
453				   struct ceph_msg *msg);
454extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
455				 struct ceph_msg *msg);
456void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
457void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
458void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc);
459
/*
 * osd_req_op_data - address of a per-op field inside a request
 * @oreq: the request (struct ceph_osd_request *); evaluated once
 * @whch: op index; evaluated once, BUG_ON() fires if it is >= r_num_ops
 * @typ:  name of a union member of struct ceph_osd_req_op (e.g. extent)
 * @fld:  name of a field inside that member
 *
 * Expands to a GNU statement expression whose value is
 * &oreq->r_ops[whch].typ.fld.
 */
460#define osd_req_op_data(oreq, whch, typ, fld)				\
461({									\
462	struct ceph_osd_request *__oreq = (oreq);			\
463	unsigned int __whch = (whch);					\
464	BUG_ON(__whch >= __oreq->r_num_ops);				\
465	&__oreq->r_ops[__whch].typ.fld;					\
466})
467
/*
 * Generic op setup.  osd_req_op_init() takes the request, an op index, an
 * opcode and flags, and returns a pointer to a struct ceph_osd_req_op.
 * NOTE(review): presumably the returned pointer is &osd_req->r_ops[which]
 * -- confirm in the implementation.
 *
 * osd_req_op_raw_data_in_pages() takes the same page-vector description as
 * the first union member of struct ceph_osd_data (pages, length, alignment,
 * pages_from_pool, own_pages).
 */
468struct ceph_osd_req_op *osd_req_op_init(struct ceph_osd_request *osd_req,
469			    unsigned int which, u16 opcode, u32 flags);
470
471extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
472					unsigned int which,
473					struct page **pages, u64 length,
474					u32 alignment, bool pages_from_pool,
475					bool own_pages);
476
/*
 * Extent-op helpers.  Each takes the request and @which, an op index.
 * NOTE(review): @which is presumably an index into r_ops[] that must be
 * below r_num_ops, as enforced by osd_req_op_data() -- confirm.
 *
 * The osd_req_op_extent_osd_data_*() / osd_req_op_extent_osd_iter()
 * prototypes take arguments matching the buffer representations of
 * struct ceph_osd_data (pages, pagelist, bio, bvecs, iter); the bio variant
 * is only declared when CONFIG_BLOCK is set.
 */
477extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
478					unsigned int which, u16 opcode,
479					u64 offset, u64 length,
480					u64 truncate_size, u32 truncate_seq);
481extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
482					unsigned int which, u64 length);
483extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
484				       unsigned int which, u64 offset_inc);
485
486extern struct ceph_osd_data *osd_req_op_extent_osd_data(
487					struct ceph_osd_request *osd_req,
488					unsigned int which);
489
490extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
491					unsigned int which,
492					struct page **pages, u64 length,
493					u32 alignment, bool pages_from_pool,
494					bool own_pages);
495extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *,
496					unsigned int which,
497					struct ceph_pagelist *pagelist);
498#ifdef CONFIG_BLOCK
499void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
500				    unsigned int which,
501				    struct ceph_bio_iter *bio_pos,
502				    u32 bio_length);
503#endif /* CONFIG_BLOCK */
504void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req,
505				      unsigned int which,
506				      struct bio_vec *bvecs, u32 num_bvecs,
507				      u32 bytes);
508void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
509					 unsigned int which,
510					 struct ceph_bvec_iter *bvec_pos);
511void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
512				unsigned int which, struct iov_iter *iter);
513
/*
 * Class-method ("cls") op helpers: request-data and response-data setters
 * in pagelist / pages / bvecs form, plus osd_req_op_cls_init(), which takes
 * the class and method names and returns an int.
 * NOTE(review): the int is presumably 0 or a negative errno -- confirm in
 * the implementation.
 */
514extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
515					unsigned int which,
516					struct ceph_pagelist *pagelist);
517extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *,
518					unsigned int which,
519					struct page **pages, u64 length,
520					u32 alignment, bool pages_from_pool,
521					bool own_pages);
522void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req,
523				       unsigned int which,
524				       struct bio_vec *bvecs, u32 num_bvecs,
525				       u32 bytes);
526extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
527					unsigned int which,
528					struct page **pages, u64 length,
529					u32 alignment, bool pages_from_pool,
530					bool own_pages);
531int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
532			const char *class, const char *method);
/*
 * Initializers for xattr, alloc-hint and copy-from ops.  Their parameters
 * line up with the xattr, alloc_hint and copy_from members of
 * struct ceph_osd_req_op.  Note that osd_req_op_copy_from_init() takes no
 * op index, unlike the other two.
 * NOTE(review): the int return values are presumably 0 or a negative errno
 * -- confirm in the implementation.
 */
533extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
534				 u16 opcode, const char *name, const void *value,
535				 size_t size, u8 cmp_op, u8 cmp_mode);
536extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
537				       unsigned int which,
538				       u64 expected_object_size,
539				       u64 expected_write_size,
540				       u32 flags);
541extern int osd_req_op_copy_from_init(struct ceph_osd_request *req,
542				     u64 src_snapid, u64 src_version,
543				     struct ceph_object_id *src_oid,
544				     struct ceph_object_locator *src_oloc,
545				     u32 src_fadvise_flags,
546				     u32 dst_fadvise_flags,
547				     u32 truncate_seq, u64 truncate_size,
548				     u8 copy_from_flags);
549
/*
 * Request construction.  ceph_osdc_alloc_request() and
 * ceph_osdc_new_request() return a request pointer;
 * ceph_osdc_alloc_messages() takes an existing request and returns an int.
 * Note that ceph_osdc_new_request() takes @len by pointer and @num_ops as a
 * signed int, whereas ceph_osdc_alloc_request() takes an unsigned count.
 * NOTE(review): failure conventions (NULL vs. ERR_PTR, errno values) are
 * not visible in this header -- confirm in the implementation.
 */
550extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
551					       struct ceph_snap_context *snapc,
552					       unsigned int num_ops,
553					       bool use_mempool,
554					       gfp_t gfp_flags);
555int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
556
557extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
558				      struct ceph_file_layout *layout,
559				      struct ceph_vino vino,
560				      u64 offset, u64 *len,
561				      unsigned int which, int num_ops,
562				      int opcode, int flags,
563				      struct ceph_snap_context *snapc,
564				      u32 truncate_seq, u64 truncate_size,
565				      bool use_mempool);
566
/*
 * Backend of ceph_alloc_sparse_ext_map(): takes the op and an entry count
 * and returns an int.  Defined outside this header.
 */
567int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt);
568
569/*
570 * How big an extent array should we preallocate for a sparse read? This is
571 * just a starting value.  If we get more than this back from the OSD, the
572 * receiver will reallocate.
573 */
574#define CEPH_SPARSE_EXT_ARRAY_INITIAL  16
575
576static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt)
577{
578	if (!cnt)
579		cnt = CEPH_SPARSE_EXT_ARRAY_INITIAL;
580
581	return __ceph_alloc_sparse_ext_map(op, cnt);
582}
583
/*
 * Request lifecycle: get/put on a request (struct ceph_osd_request carries
 * a kref, r_kref), start, cancel, wait and sync, followed by notify
 * flushing, a map request helper and ceph_osdc_call().
 * ceph_osdc_call() takes a single request page with its length and returns
 * response data through @resp_pages / @resp_len.
 * NOTE(review): blocking behaviour and return conventions are not visible
 * in this header -- confirm in the implementation.
 */
584extern void ceph_osdc_get_request(struct ceph_osd_request *req);
585extern void ceph_osdc_put_request(struct ceph_osd_request *req);
586
587void ceph_osdc_start_request(struct ceph_osd_client *osdc,
588			     struct ceph_osd_request *req);
589extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
590extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
591				  struct ceph_osd_request *req);
592extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
593
594extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
595void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
596
597int ceph_osdc_call(struct ceph_osd_client *osdc,
598		   struct ceph_object_id *oid,
599		   struct ceph_object_locator *oloc,
600		   const char *class, const char *method,
601		   unsigned int flags,
602		   struct page *req_page, size_t req_len,
603		   struct page **resp_pages, size_t *resp_len);
604
605/* watch/notify */
/*
 * ceph_osdc_watch() takes an object id/locator, the two callbacks and an
 * opaque data pointer, and returns a linger request handle;
 * ceph_osdc_unwatch() and ceph_osdc_watch_check() take that handle back.
 * ceph_osdc_notify() and ceph_osdc_list_watchers() hand results back through
 * pointer out-parameters (preply_pages/preply_len, watchers/num_watchers).
 * NOTE(review): error reporting of ceph_osdc_watch() (NULL vs. ERR_PTR) and
 * ownership of the returned buffers are not visible here -- confirm in the
 * implementation.
 */
606struct ceph_osd_linger_request *
607ceph_osdc_watch(struct ceph_osd_client *osdc,
608		struct ceph_object_id *oid,
609		struct ceph_object_locator *oloc,
610		rados_watchcb2_t wcb,
611		rados_watcherrcb_t errcb,
612		void *data);
613int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
614		      struct ceph_osd_linger_request *lreq);
615
616int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
617			 struct ceph_object_id *oid,
618			 struct ceph_object_locator *oloc,
619			 u64 notify_id,
620			 u64 cookie,
621			 void *payload,
622			 u32 payload_len);
623int ceph_osdc_notify(struct ceph_osd_client *osdc,
624		     struct ceph_object_id *oid,
625		     struct ceph_object_locator *oloc,
626		     void *payload,
627		     u32 payload_len,
628		     u32 timeout,
629		     struct page ***preply_pages,
630		     size_t *preply_len);
631int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
632			  struct ceph_osd_linger_request *lreq);
633int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
634			    struct ceph_object_id *oid,
635			    struct ceph_object_locator *oloc,
636			    struct ceph_watch_item **watchers,
637			    u32 *num_watchers);
638
639/* Find offset into the buffer of the end of the extent map */
640static inline u64 ceph_sparse_ext_map_end(struct ceph_osd_req_op *op)
641{
642	struct ceph_sparse_extent *ext;
643
644	/* No extents? No data */
645	if (op->extent.sparse_ext_cnt == 0)
646		return 0;
647
648	ext = &op->extent.sparse_ext[op->extent.sparse_ext_cnt - 1];
649
650	return ext->off + ext->len - op->extent.offset;
651}
652
653#endif
654