// SPDX-License-Identifier: GPL-2.0
/*
 * Bad block management
 *
 * - Heavily based on MD badblocks code from Neil Brown
 *
 * Copyright (c) 2015, Intel Corporation.
 */

#include <linux/badblocks.h>
#include <linux/seqlock.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/slab.h>

/*
 * The purpose of badblocks set/clear is to manage bad block ranges which are
 * identified by LBA addresses.
 *
 * When the caller of badblocks_set() wants to set a range of bad blocks, the
 * setting range can be acked or unacked. The setting range may merge with,
 * overwrite, or skip the already set ranges it overlaps, depending on how
 * the ranges overlap or are adjacent, and on the acknowledgment type of the
 * ranges. Things get more complicated when the setting range covers multiple
 * already set bad block ranges, with the additional restrictions of the
 * maximum length of each bad range and the limited bad table space.
 *
 * It is difficult and unnecessary to handle all the possible situations at
 * once. When setting a large range of bad blocks, we can handle it by
 * dividing the large range into smaller ones whenever we encounter overlap,
 * maximum range length, or bad-table-full conditions. Each time only a
 * smaller piece of the bad range is handled, with a limited number of
 * conditions for how it interacts with possibly overlapping or adjacent
 * already set bad block ranges. The hard, complicated problem then becomes
 * much simpler to handle properly.
 *
 * When setting a range of bad blocks in the bad table, the simplified
 * situations to be considered are as follows. (The already set bad block
 * ranges are named with prefix E, and the setting bad blocks range is named
 * with prefix S.)
 *
 * 1) A setting range is not overlapped or adjacent to any other already set bad
 *    block range.
 *                         +--------+
 *                         |    S   |
 *                         +--------+
 *        +-------------+               +-------------+
 *        |      E1     |               |      E2     |
 *        +-------------+               +-------------+
 *    For this situation, if the bad blocks table is not full, just allocate a
 *    free slot from the bad blocks table to mark the setting range S. The
 *    result is,
 *        +-------------+  +--------+   +-------------+
 *        |      E1     |  |    S   |   |      E2     |
 *        +-------------+  +--------+   +-------------+
 * 2) A setting range starts exactly at a start LBA of an already set bad blocks
 *    range.
 * 2.1) The setting range size < already set range size
 *        +--------+
 *        |    S   |
 *        +--------+
 *        +-------------+
 *        |      E      |
 *        +-------------+
 * 2.1.1) If S and E are both acked or both unacked ranges, the setting range S
 *    can be merged into the existing bad range E. The result is,
 *        +-------------+
 *        |      S      |
 *        +-------------+
 * 2.1.2) If S is unacked and E is acked, the setting will be denied, and
 *    the result is,
 *        +-------------+
 *        |      E      |
 *        +-------------+
 * 2.1.3) If S is acked and E is unacked, range S can overwrite E. An extra
 *    slot from the bad blocks table will be allocated for S, and the head
 *    of E will move to the end of the inserted range S. The result is,
 *        +--------+----+
 *        |    S   | E  |
 *        +--------+----+
 * 2.2) The setting range size == already set range size
 * 2.2.1) If S and E are both acked or both unacked ranges, the setting range S
 *    can be merged into the existing bad range E. The result is,
 *        +-------------+
 *        |      S      |
 *        +-------------+
 * 2.2.2) If S is unacked and E is acked, the setting will be denied, and
 *    the result is,
 *        +-------------+
 *        |      E      |
 *        +-------------+
 * 2.2.3) If S is acked and E is unacked, range S can overwrite all of the
 *    bad blocks range E. The result is,
 *        +-------------+
 *        |      S      |
 *        +-------------+
 * 2.3) The setting range size > already set range size
 *        +-------------------+
 *        |          S        |
 *        +-------------------+
 *        +-------------+
 *        |      E      |
 *        +-------------+
 *    For this situation, the setting range S can be treated as two parts: the
 *    first part (S1) is the same size as the already set range E, the second
 *    part (S2) is the rest of the setting range.
 *        +-------------+-----+        +-------------+       +-----+
 *        |    S1       | S2  |        |     S1      |       | S2  |
 *        +-------------+-----+  ===>  +-------------+       +-----+
 *        +-------------+              +-------------+
 *        |      E      |              |      E      |
 *        +-------------+              +-------------+
 *    Now we only focus on how to handle the setting range S1 and the already
 *    set range E, which is already explained in 2.2); the rest S2 will be
 *    handled later in the next loop.
 * 3) A setting range starts before the start LBA of an already set bad blocks
 *    range.
 *        +-------------+
 *        |      S      |
 *        +-------------+
 *             +-------------+
 *             |      E      |
 *             +-------------+
 *    For this situation, the setting range S can be divided into two parts: the
 *    first (S1) ends at the start LBA of the already set range E, the second
 *    part (S2) starts exactly at the start LBA of the already set range E.
 *        +----+---------+             +----+      +---------+
 *        | S1 |    S2   |             | S1 |      |    S2   |
 *        +----+---------+      ===>   +----+      +---------+
 *             +-------------+                     +-------------+
 *             |      E      |                     |      E      |
 *             +-------------+                     +-------------+
 *    Now only the first part S1 should be handled in this loop, which is in a
 *    similar condition to 1). The rest part S2 has exactly the same start LBA
 *    address as the already set range E, so it will be handled in the next
 *    loop by one of the situations in 2).
 * 4) A setting range starts after the start LBA of an already set bad blocks
 *    range.
 * 4.1) If the setting range S exactly matches the tail part of the already set
 *    bad blocks range E, as the following chart shows,
 *            +---------+
 *            |   S     |
 *            +---------+
 *        +-------------+
 *        |      E      |
 *        +-------------+
 * 4.1.1) If ranges S and E have the same acknowledge value (both acked or
 *    unacked), they will be merged into one. The result is,
 *        +-------------+
 *        |      S      |
 *        +-------------+
 * 4.1.2) If range E is acked and the setting range S is unacked, the setting
 *    request of S will be rejected. The result is,
 *        +-------------+
 *        |      E      |
 *        +-------------+
 * 4.1.3) If range E is unacked and the setting range S is acked, then S may
 *    overwrite the overlapped part of E. The result is,
 *        +---+---------+
 *        | E |    S    |
 *        +---+---------+
 * 4.2) If the setting range S stays in the middle of an already set range E, as
 *    the following chart shows,
 *             +----+
 *             | S  |
 *             +----+
 *        +--------------+
 *        |       E      |
 *        +--------------+
 * 4.2.1) If ranges S and E have the same acknowledge value (both acked or
 *    unacked), they will be merged into one. The result is,
 *        +--------------+
 *        |       S      |
 *        +--------------+
 * 4.2.2) If range E is acked and the setting range S is unacked, the setting
 *    request of S will be rejected. The result is again,
 *        +--------------+
 *        |       E      |
 *        +--------------+
 * 4.2.3) If range E is unacked and the setting range S is acked, then S will be
 *    inserted into the middle of E and split the previous range E into two
 *    parts (E1 and E2). The result is,
 *        +----+----+----+
 *        | E1 |  S | E2 |
 *        +----+----+----+
 * 4.3) If the setting bad blocks range S overlaps an already set bad blocks
 *    range E, such that S starts after the start LBA of E and ends after the
 *    end LBA of E, as the following chart shows,
 *            +-------------------+
 *            |          S        |
 *            +-------------------+
 *        +-------------+
 *        |      E      |
 *        +-------------+
 *    For this situation the range S can be divided into two parts: the first
 *    part (S1) ends at the end of range E, and the second part (S2) is the
 *    rest of the original S.
 *            +---------+---------+            +---------+      +---------+
 *            |    S1   |    S2   |            |    S1   |      |    S2   |
 *            +---------+---------+  ===>      +---------+      +---------+
 *        +-------------+                  +-------------+
 *        |      E      |                  |      E      |
 *        +-------------+                  +-------------+
 *     Now in this loop the setting range S1 and the already set range E can be
 *     handled as in situation 4.1); the rest range S2 will be handled in the
 *     next loop and is ignored in this one.
 * 5) A setting bad blocks range S is adjacent to one or more already set bad
 *    blocks range(s), and they are all acked or all unacked.
 * 5.1) Front merge: the already set bad blocks range E is before the setting
 *    range S and they are adjacent,
 *                +------+
 *                |  S   |
 *                +------+
 *        +-------+
 *        |   E   |
 *        +-------+
 * 5.1.1) When the total size of ranges S and E <= BB_MAX_LEN and their
 *    acknowledge values are the same, the setting range S can be front merged
 *    into range E. The result is,
 *        +--------------+
 *        |       S      |
 *        +--------------+
 * 5.1.2) Otherwise these two ranges cannot merge; just insert the setting
 *    range S right after the already set range E into the bad blocks table.
 *    The result is,
 *        +--------+------+
 *        |   E    |   S  |
 *        +--------+------+
 * 6) Special cases which the above conditions cannot handle
 * 6.1) Multiple already set ranges may merge into fewer ones in a full bad table
 *        +-------------------------------------------------------+
 *        |                           S                           |
 *        +-------------------------------------------------------+
 *        |<----- BB_MAX_LEN ----->|
 *                                 +-----+     +-----+   +-----+
 *                                 | E1  |     | E2  |   | E3  |
 *                                 +-----+     +-----+   +-----+
 *     In the above example, when the bad blocks table is full, inserting the
 *     first part of the setting range S will fail because no more available
 *     slots can be allocated from the bad blocks table. In this situation a
 *     proper setting method should go through the whole setting bad blocks
 *     range and look for a chance to merge already set ranges into fewer
 *     ones. When there is an available slot in the bad blocks table, retry
 *     to handle as many of the remaining setting bad blocks ranges as
 *     possible.
 *        +------------------------+
 *        |          S3            |
 *        +------------------------+
 *        |<----- BB_MAX_LEN ----->|
 *                                 +-----+-----+-----+---+-----+--+
 *                                 |       S1        |     S2     |
 *                                 +-----+-----+-----+---+-----+--+
 *     The above chart shows that although the first part (S3) cannot be
 *     inserted due to no space in the bad blocks table, the following E1, E2
 *     and E3 ranges can be merged with the rest of S into the smaller ranges
 *     S1 and S2. Now there is 1 free slot in the bad blocks table.
 *        +------------------------+-----+-----+-----+---+-----+--+
 *        |           S3           |       S1        |     S2     |
 *        +------------------------+-----+-----+-----+---+-----+--+
 *     Since the bad blocks table is not full anymore, retry the original
 *     setting range S. Now the setting range S3 can be inserted into the bad
 *     blocks table with the slot previously freed by the multiple-range merge.
 * 6.2) Front merge after overwrite
 *    In the following example, in the bad blocks table, E1 is an acked bad
 *    blocks range and E2 is an unacked bad blocks range, therefore they are
 *    not able to merge into a larger range. The setting bad blocks range S is
 *    acked, therefore part of E2 can be overwritten by S.
 *                      +--------+
 *                      |    S   |                             acknowledged
 *                      +--------+                         S:       1
 *              +-------+-------------+                   E1:       1
 *              |   E1  |    E2       |                   E2:       0
 *              +-------+-------------+
 *     With the previous simplified routines, after overwriting part of E2
 *     with S, the bad blocks table would be (E3 is the remaining part of E2
 *     which is not overwritten by S),
 *                                                             acknowledged
 *              +-------+--------+----+                    S:       1
 *              |   E1  |    S   | E3 |                   E1:       1
 *              +-------+--------+----+                   E3:       0
 *     The above result is correct but not perfect. Ranges E1 and S in the bad
 *     blocks table are both acked; merging them into one larger range may
 *     occupy less bad blocks table space and make badblocks_check() faster.
 *     Therefore in such a situation, after overwriting with range S, the
 *     previous range E1 should be checked for a possible front combination.
 *     Then the ideal result can be,
 *              +----------------+----+                        acknowledged
 *              |       E1       | E3 |                   E1:       1
 *              +----------------+----+                   E3:       0
 * 6.3) Behind merge: the already set bad blocks range E is behind the setting
 *    range S and they are adjacent. Normally we don't need to care about this
 *    because the front merge handles it while going through range S from head
 *    to tail, except for the tail part of range S. When the setting range S
 *    is fully handled, none of the above simplified routines check whether
 *    the tail LBA of range S is adjacent to the next already set range, so
 *    they do not merge the two even when it is possible.
 *        +------+
 *        |  S   |
 *        +------+
 *               +-------+
 *               |   E   |
 *               +-------+
 *    For the above special situation, when the setting range S is fully
 *    handled and the loop ends, an extra check is necessary for whether the
 *    next already set range E is right after S and mergeable.
 * 6.3.1) When the total size of ranges E and S <= BB_MAX_LEN and their
 *    acknowledge values are the same, the setting range S can be behind
 *    merged into range E. The result is,
 *        +--------------+
 *        |       S      |
 *        +--------------+
 * 6.3.2) Otherwise these two ranges cannot merge; just insert the setting range
 *     S in front of the already set range E in the bad blocks table. The result
 *     is,
 *        +------+-------+
 *        |  S   |   E   |
 *        +------+-------+
 *
 * All the above 5 simplified situations and 3 special cases may cover 99%+ of
 * the bad block range setting conditions. There may be some rare corner case
 * that is not considered and optimized, but it won't hurt if badblocks_set()
 * fails due to no space, or if some ranges are not merged to save bad blocks
 * table space.
 *
 * Inside badblocks_set() each loop starts by jumping to the re_insert label.
 * Every time through the new loop, prev_badblocks() is called to find an
 * already set range which starts before or at the current setting range.
 * Since the setting bad blocks range is handled from head to tail, in most
 * cases it is unnecessary to do the binary search inside prev_badblocks();
 * it is possible to provide a hint to prev_badblocks() for a fast path, so
 * that the expensive binary search can be avoided. In my test with the hint
 * to prev_badblocks(), except for the first loop, all remaining calls to
 * prev_badblocks() can go into the fast path and return the correct bad
 * blocks table index immediately.
 *
 *
 * Clearing a bad blocks range from the bad block table follows a similar idea
 * to setting, but is much simpler. The only thing that needs to be noticed is
 * that when the clearing range hits the middle of a bad block range, the
 * existing bad block range will split into two, and one more item has to be
 * added into the bad block table. The simplified situations to be considered
 * are as follows. (The already set bad blocks ranges in the bad block table
 * are named with prefix E, and the clearing bad blocks range is named with
 * prefix C.)
 *
 * 1) A clearing range is not overlapped with any already set range in the bad
 *    block table.
 *    +-----+         |          +-----+         |          +-----+
 *    |  C  |         |          |  C  |         |          |  C  |
 *    +-----+         or         +-----+         or         +-----+
 *            +---+   |   +----+         +----+  |  +---+
 *            | E |   |   | E1 |         | E2 |  |  | E |
 *            +---+   |   +----+         +----+  |  +---+
 *    For the above situations, no bad blocks are cleared and no failure
 *    happens; simply return 0.
 * 2) The clearing range hits the middle of an already set bad blocks range in
 *    the bad block table.
 *            +---+
 *            | C |
 *            +---+
 *     +-----------------+
 *     |         E       |
 *     +-----------------+
 *    In this situation if the bad block table is not full, the range E will be
 *    split into two ranges E1 and E2. The result is,
 *     +------+   +------+
 *     |  E1  |   |  E2  |
 *     +------+   +------+
 * 3) The clearing range starts exactly at the same LBA as an already set bad
 *    block range from the bad block table.
 * 3.1) Partially covered at the head part
 *         +------------+
 *         |     C      |
 *         +------------+
 *         +-----------------+
 *         |         E       |
 *         +-----------------+
 *    For this situation, the overlapped already set range will update its
 *    start LBA to the end of C and shrink the range to BB_LEN(E) - BB_LEN(C).
 *    No item is deleted from the bad block table. The result is,
 *                      +----+
 *                      | E1 |
 *                      +----+
 * 3.2) Exactly fully covered
 *         +-----------------+
 *         |         C       |
 *         +-----------------+
 *         +-----------------+
 *         |         E       |
 *         +-----------------+
 *    For this situation the whole bad blocks range E will be cleared and its
 *    corresponding item is deleted from the bad block table.
 * 4) The clearing range ends exactly at the same LBA as an already set bad
 *    block range.
 *                   +-------+
 *                   |   C   |
 *                   +-------+
 *         +-----------------+
 *         |         E       |
 *         +-----------------+
 *    For the above situation, the already set range E is updated to shrink its
 *    end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).
 *    The result is,
 *         +---------+
 *         |    E    |
 *         +---------+
 * 5) The clearing range is partially overlapped with an already set bad block
 *    range from the bad block table.
 * 5.1) The already set bad block range is front overlapped with the clearing
 *    range.
 *         +----------+
 *         |     C    |
 *         +----------+
 *              +------------+
 *              |      E     |
 *              +------------+
 *   For such a situation, the clearing range C can be treated as two parts.
 *   The first part ends at the start LBA of range E, and the second part
 *   starts at the same LBA as range E.
 *         +----+-----+               +----+   +-----+
 *         | C1 | C2  |               | C1 |   | C2  |
 *         +----+-----+         ===>  +----+   +-----+
 *              +------------+                 +------------+
 *              |      E     |                 |      E     |
 *              +------------+                 +------------+
 *   Now the first part C1 can be handled as condition 1), and the second part
 *   C2 can be handled as condition 3.1) in the next loop.
 * 5.2) The already set bad block range is behind overlapped with the clearing
 *   range.
 *                 +----------+
 *                 |     C    |
 *                 +----------+
 *         +------------+
 *         |      E     |
 *         +------------+
 *   For such a situation, the clearing range C can be treated as two parts.
 *   The first part C1 ends at the same end LBA as range E, and the second
 *   part starts at the end LBA of range E.
 *                 +----+-----+                 +----+    +-----+
 *                 | C1 | C2  |                 | C1 |    | C2  |
 *                 +----+-----+  ===>           +----+    +-----+
 *         +------------+               +------------+
 *         |      E     |               |      E     |
 *         +------------+               +------------+
 *   Now the first part clearing range C1 can be handled as condition 4), and
 *   the second part clearing range C2 can be handled as condition 1) in the
 *   next loop.
 *
 *   All bad blocks range clearing can be simplified into the above 5 situations
 *   by only handling the head part of the clearing range in each run of the
 *   while-loop. The idea is similar to bad blocks range setting but much
 *   simpler.
 */
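
/*
 * As a concrete illustration of the rules above, here is a short,
 * hypothetical call sequence (a sketch only; the sector numbers are made
 * up, and bb is assumed to be an already initialized struct badblocks
 * with shift == 0):
 *
 *	badblocks_set(bb, 64, 32, 1);	// table: [64, 96) acked
 *	badblocks_set(bb, 96, 16, 1);	// adjacent and both acked, front
 *					// merge: [64, 112) acked
 *	badblocks_set(bb, 80, 8, 0);	// unacked inside an acked range,
 *					// denied: table unchanged
 *	badblocks_clear(bb, 72, 8);	// hits the middle of [64, 112),
 *					// split: [64, 72) and [80, 112)
 */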

/*
 * Find the last range which starts at-or-before 's' in the bad table. The
 * search starts from index 'hint' and stops at index 'hint_end' of the bad
 * table.
 */
static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
{
	int hint_end = hint + 2;
	u64 *p = bb->page;
	int ret = -1;

	while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
	       (BB_OFFSET(p[hint]) <= s)) {
		if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
			ret = hint;
			break;
		}
		hint++;
	}

	return ret;
}

/*
 * Find the last range which starts at-or-before bad->start. If 'hint' is
 * provided (hint >= 0) then search the bad table from the hint first. It is
 * very likely that the wanted bad range can be found from the hint index,
 * so the unnecessary while-loop iterations can be avoided.
 */
static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
			  int hint)
{
	sector_t s = bad->start;
	int ret = -1;
	int lo, hi;
	u64 *p;

	if (!bb->count)
		goto out;

	if (hint >= 0) {
		ret = prev_by_hint(bb, s, hint);
		if (ret >= 0)
			goto out;
	}

	lo = 0;
	hi = bb->count;
	p = bb->page;

	/* The following bisect search might be unnecessary */
	if (BB_OFFSET(p[lo]) > s)
		return -1;
	if (BB_OFFSET(p[hi - 1]) <= s)
		return hi - 1;

	/* Do bisect search in bad table */
	while (hi - lo > 1) {
		int mid = (lo + hi)/2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a == s) {
			ret = mid;
			goto out;
		}

		if (a < s)
			lo = mid;
		else
			hi = mid;
	}

	if (BB_OFFSET(p[lo]) <= s)
		ret = lo;
out:
	return ret;
}
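
/*
 * An illustrative example (made-up table contents): with a bad table
 * holding two entries whose offsets are 16 and 64, prev_badblocks()
 * returns -1 for bad->start == 8 (before all entries), 0 for
 * bad->start == 16 or 40, and 1 for bad->start == 64 or anything larger.
 */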

/*
 * Return 'true' if the range indicated by 'bad' can be backward merged
 * with the bad range (from the bad table) indexed by 'behind'.
 */
static bool can_merge_behind(struct badblocks *bb,
			     struct badblocks_context *bad, int behind)
{
	sector_t sectors = bad->len;
	sector_t s = bad->start;
	u64 *p = bb->page;

	if ((s < BB_OFFSET(p[behind])) &&
	    ((s + sectors) >= BB_OFFSET(p[behind])) &&
	    ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
	    BB_ACK(p[behind]) == bad->ack)
		return true;
	return false;
}

/*
 * Do backward merge for the range indicated by 'bad' and the bad range
 * (from the bad table) indexed by 'behind'. The return value is the
 * number of sectors merged from bad->len.
 */
static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
			int behind)
{
	sector_t sectors = bad->len;
	sector_t s = bad->start;
	u64 *p = bb->page;
	int merged = 0;

	WARN_ON(s >= BB_OFFSET(p[behind]));
	WARN_ON((s + sectors) < BB_OFFSET(p[behind]));

	if (s < BB_OFFSET(p[behind])) {
		merged = BB_OFFSET(p[behind]) - s;
		p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);

		WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
	}

	return merged;
}

/*
 * Return 'true' if the range indicated by 'bad' can be forward
 * merged with the bad range (from the bad table) indexed by 'prev'.
 */
static bool can_merge_front(struct badblocks *bb, int prev,
			    struct badblocks_context *bad)
{
	sector_t s = bad->start;
	u64 *p = bb->page;

	if (BB_ACK(p[prev]) == bad->ack &&
	    (s < BB_END(p[prev]) ||
	     (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
		return true;
	return false;
}

/*
 * Do forward merge for the range indicated by 'bad' and the bad range
 * (from the bad table) indexed by 'prev'. The return value is the number
 * of sectors merged from bad->len.
 */
static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
{
	sector_t sectors = bad->len;
	sector_t s = bad->start;
	u64 *p = bb->page;
	int merged = 0;

	WARN_ON(s > BB_END(p[prev]));

	if (s < BB_END(p[prev])) {
		merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
	} else {
		merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
		if ((prev + 1) < bb->count &&
		    merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
			merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
		}

		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
				  BB_LEN(p[prev]) + merged, bad->ack);
	}

	return merged;
}

/*
 * 'Combine' is a special case which can_merge_front() is not able to
 * handle: if a bad range (indexed by 'prev' from the bad table) starts
 * exactly at bad->start, and the bad range ahead of 'prev' (indexed by
 * 'prev - 1' from the bad table) ends exactly where 'prev' starts, and
 * the sum of their lengths does not exceed the BB_MAX_LEN limitation,
 * then these two bad ranges (from the bad table) can be combined.
 *
 * Return 'true' if the bad ranges indexed by 'prev' and 'prev - 1' from
 * the bad table can be combined.
 */
static bool can_combine_front(struct badblocks *bb, int prev,
			      struct badblocks_context *bad)
{
	u64 *p = bb->page;

	if ((prev > 0) &&
	    (BB_OFFSET(p[prev]) == bad->start) &&
	    (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
	    (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
	    (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
		return true;
	return false;
}

/*
 * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from the bad
 * table) into one larger bad range, and the new range is indexed by
 * 'prev - 1'.
 * The caller of front_combine() will decrease bb->count, therefore
 * it is unnecessary to clear p[prev] after the front combine.
 */
static void front_combine(struct badblocks *bb, int prev)
{
	u64 *p = bb->page;

	p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
			      BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
			      BB_ACK(p[prev]));
	if ((prev + 1) < bb->count)
		memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
}
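
/*
 * For instance (illustrative values): with p[prev - 1] = {offset 0, len 8}
 * and p[prev] = {offset 8, len 8} in the same ack state, front_combine()
 * rewrites p[prev - 1] to {offset 0, len 16} and moves the remaining
 * entries down by one slot.
 */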

/*
 * Return 'true' if the range indicated by 'bad' is exactly forward
 * overlapped with the bad range (from the bad table) indexed by 'front'.
 * Exactly forward overlap means the bad range (from the bad table) indexed
 * by 'front' does not cover the whole range indicated by 'bad'.
 */
static bool overlap_front(struct badblocks *bb, int front,
			  struct badblocks_context *bad)
{
	u64 *p = bb->page;

	if (bad->start >= BB_OFFSET(p[front]) &&
	    bad->start < BB_END(p[front]))
		return true;
	return false;
}

/*
 * Return 'true' if the range indicated by 'bad' is exactly backward
 * overlapped with the bad range (from the bad table) indexed by 'behind'.
 */
static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
			   int behind)
{
	u64 *p = bb->page;

	if (bad->start < BB_OFFSET(p[behind]) &&
	    (bad->start + bad->len) > BB_OFFSET(p[behind]))
		return true;
	return false;
}

/*
 * Return 'true' if the range indicated by 'bad' can overwrite the bad
 * range (from the bad table) indexed by 'prev'.
 *
 * The range indicated by 'bad' can overwrite the bad range indexed by
 * 'prev' when,
 * 1) The whole range indicated by 'bad' can cover partial or whole bad
 *    range (from the bad table) indexed by 'prev'.
 * 2) The ack value of 'bad' is larger than or equal to the ack value of
 *    the bad range 'prev'.
 *
 * If the overwriting doesn't cover the whole bad range (from the bad table)
 * indexed by 'prev', new range(s) might be split from the existing bad range,
 * 1) The overwrite covers the head or tail part of the existing bad range:
 *    1 extra bad range will be split off and added into the bad table.
 * 2) The overwrite covers the middle of the existing bad range: 2 extra bad
 *    ranges will be split off (ahead of and after the overwritten range)
 *    and added into the bad table.
 * The number of extra split ranges of the overwriting is stored in
 * 'extra' and returned for the caller.
 */
static bool can_front_overwrite(struct badblocks *bb, int prev,
				struct badblocks_context *bad, int *extra)
{
	u64 *p = bb->page;
	int len;

	WARN_ON(!overlap_front(bb, prev, bad));

	if (BB_ACK(p[prev]) >= bad->ack)
		return false;

	if (BB_END(p[prev]) <= (bad->start + bad->len)) {
		len = BB_END(p[prev]) - bad->start;
		if (BB_OFFSET(p[prev]) == bad->start)
			*extra = 0;
		else
			*extra = 1;

		bad->len = len;
	} else {
		if (BB_OFFSET(p[prev]) == bad->start)
			*extra = 1;
		else
		/*
		 * prev range will be split into two; besides the overwritten
		 * one, an extra slot is needed from the bad table.
		 */
			*extra = 2;
	}

	/* The table may become exactly full; only reject a real overflow */
	if ((bb->count + (*extra)) > MAX_BADBLOCKS)
		return false;

	return true;
}

/*
 * Do the overwrite from the range indicated by 'bad' to the bad range
 * (from the bad table) indexed by 'prev'.
 * The previously called can_front_overwrite() will provide how many
 * extra bad range(s) might be split and added into the bad table. All
 * the splitting cases in the bad table will be handled here.
 */
static int front_overwrite(struct badblocks *bb, int prev,
			   struct badblocks_context *bad, int extra)
{
	u64 *p = bb->page;
	sector_t orig_end = BB_END(p[prev]);
	int orig_ack = BB_ACK(p[prev]);

	switch (extra) {
	case 0:
		p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
				  bad->ack);
		break;
	case 1:
		if (BB_OFFSET(p[prev]) == bad->start) {
			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
					  bad->len, bad->ack);
			memmove(p + prev + 2, p + prev + 1,
				(bb->count - prev - 1) * 8);
			p[prev + 1] = BB_MAKE(bad->start + bad->len,
					      orig_end - BB_END(p[prev]),
					      orig_ack);
		} else {
			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
					  bad->start - BB_OFFSET(p[prev]),
					  orig_ack);
			/*
			 * prev + 2 -> prev + 1 + 1, which is for,
			 * 1) prev + 1: the slot index of the previous one
			 * 2) + 1: one more slot for extra being 1.
			 */
			memmove(p + prev + 2, p + prev + 1,
				(bb->count - prev - 1) * 8);
			p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
		}
		break;
	case 2:
		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
				  bad->start - BB_OFFSET(p[prev]),
				  orig_ack);
		/*
		 * prev + 3 -> prev + 1 + 2, which is for,
		 * 1) prev + 1: the slot index of the previous one
		 * 2) + 2: two more slots for extra being 2.
		 */
		memmove(p + prev + 3, p + prev + 1,
			(bb->count - prev - 1) * 8);
		p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
		p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
				      orig_end - BB_END(p[prev + 1]),
				      orig_ack);
		break;
	default:
		break;
	}

	return bad->len;
}
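
/*
 * Illustration (made-up numbers): overwriting the middle of an unacked
 * range {offset 0, len 16} with an acked 'bad' of {start 4, len 4} takes
 * the extra == 2 path above and leaves three entries in the table:
 * {0, 4} unacked, {4, 4} acked and {8, 8} unacked.
 */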

/*
 * Explicitly insert a range indicated by 'bad' to the bad table, where
 * the location is indexed by 'at'.
 */
static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
{
	u64 *p = bb->page;
	int len;

	WARN_ON(badblocks_full(bb));

	len = min_t(sector_t, bad->len, BB_MAX_LEN);
	if (at < bb->count)
		memmove(p + at + 1, p + at, (bb->count - at) * 8);
	p[at] = BB_MAKE(bad->start, len, bad->ack);

	return len;
}

static void badblocks_update_acked(struct badblocks *bb)
{
	bool unacked = false;
	u64 *p = bb->page;
	int i;

	if (!bb->unacked_exist)
		return;

	for (i = 0; i < bb->count; i++) {
		if (!BB_ACK(p[i])) {
			unacked = true;
			break;
		}
	}

	if (!unacked)
		bb->unacked_exist = 0;
}

/* Do the exact work to set a bad block range into the bad block table */
static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
			  int acknowledged)
{
	int retried = 0, space_desired = 0;
	int orig_len, len = 0, added = 0;
	struct badblocks_context bad;
	int prev = -1, hint = -1;
	sector_t orig_start;
	unsigned long flags;
	int rv = 0;
	u64 *p;

	if (bb->shift < 0)
		/* badblocks are disabled */
		return 1;

	if (sectors == 0)
		/* Invalid sectors number */
		return 1;

	if (bb->shift) {
		/* round the start down, and the end up */
		sector_t next = s + sectors;

		s >>= bb->shift;
		next += (1 << bb->shift) - 1;
		next >>= bb->shift;
		sectors = next - s;
	}

	write_seqlock_irqsave(&bb->lock, flags);

	orig_start = s;
	orig_len = sectors;
	bad.ack = acknowledged;
	p = bb->page;

re_insert:
	bad.start = s;
	bad.len = sectors;
	len = 0;

	if (badblocks_empty(bb)) {
		len = insert_at(bb, 0, &bad);
		bb->count++;
		added++;
		goto update_sectors;
	}

	prev = prev_badblocks(bb, &bad, hint);

	/* start before all badblocks */
	if (prev < 0) {
		if (!badblocks_full(bb)) {
			/* insert on the first */
			if (bad.len > (BB_OFFSET(p[0]) - bad.start))
				bad.len = BB_OFFSET(p[0]) - bad.start;
			len = insert_at(bb, 0, &bad);
			bb->count++;
			added++;
			hint = 0;
			goto update_sectors;
		}

		/* No space, try to merge */
		if (overlap_behind(bb, &bad, 0)) {
			if (can_merge_behind(bb, &bad, 0)) {
				len = behind_merge(bb, &bad, 0);
				added++;
			} else {
				len = BB_OFFSET(p[0]) - s;
				space_desired = 1;
			}
			hint = 0;
			goto update_sectors;
		}

		/* no table space, give up */
		goto out;
	}

	/* in case p[prev-1] can be merged with p[prev] */
	if (can_combine_front(bb, prev, &bad)) {
		front_combine(bb, prev);
		bb->count--;
		added++;
		hint = prev;
		goto update_sectors;
	}

	if (overlap_front(bb, prev, &bad)) {
		if (can_merge_front(bb, prev, &bad)) {
			len = front_merge(bb, prev, &bad);
			added++;
		} else {
			int extra = 0;

			if (!can_front_overwrite(bb, prev, &bad, &extra)) {
				len = min_t(sector_t,
					    BB_END(p[prev]) - s, sectors);
				hint = prev;
				goto update_sectors;
			}

			len = front_overwrite(bb, prev, &bad, extra);
			added++;
			bb->count += extra;

			if (can_combine_front(bb, prev, &bad)) {
				front_combine(bb, prev);
				bb->count--;
			}
		}
		hint = prev;
		goto update_sectors;
	}

	if (can_merge_front(bb, prev, &bad)) {
		len = front_merge(bb, prev, &bad);
		added++;
		hint = prev;
		goto update_sectors;
	}

	/* if no space in table, still try to merge in the covered range */
	if (badblocks_full(bb)) {
		/* skip the cannot-merge range */
		if (((prev + 1) < bb->count) &&
		    overlap_behind(bb, &bad, prev + 1) &&
		    ((s + sectors) >= BB_END(p[prev + 1]))) {
			len = BB_END(p[prev + 1]) - s;
			hint = prev + 1;
			goto update_sectors;
		}

		/* no retry any more */
		len = sectors;
		space_desired = 1;
		hint = -1;
		goto update_sectors;
	}

	/* cannot merge and there is space in bad table */
	if ((prev + 1) < bb->count &&
	    overlap_behind(bb, &bad, prev + 1))
		bad.len = min_t(sector_t,
				bad.len, BB_OFFSET(p[prev + 1]) - bad.start);

	len = insert_at(bb, prev + 1, &bad);
	bb->count++;
	added++;
	hint = prev + 1;

update_sectors:
	s += len;
	sectors -= len;

	if (sectors > 0)
		goto re_insert;

	WARN_ON(sectors < 0);

	/*
	 * Check whether the following already set range can be
	 * merged. The (prev < 0) condition is not handled here,
	 * because it's already complicated enough.
	 */
	if (prev >= 0 &&
	    (prev + 1) < bb->count &&
	    BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
	    (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
	    BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
		p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
				  BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
				  BB_ACK(p[prev]));

		if ((prev + 2) < bb->count)
			memmove(p + prev + 1, p + prev + 2,
				(bb->count - (prev + 2)) * 8);
		bb->count--;
	}

	if (space_desired && !badblocks_full(bb)) {
		s = orig_start;
		sectors = orig_len;
		space_desired = 0;
		if (retried++ < 3)
			goto re_insert;
	}

out:
	if (added) {
		set_changed(bb);

		if (!acknowledged)
			bb->unacked_exist = 1;
		else
			badblocks_update_acked(bb);
	}

	write_sequnlock_irqrestore(&bb->lock, flags);

	if (!added)
		rv = 1;

	return rv;
}

/*
 * Clear the bad block range from the bad block table which is front
 * overlapped with the clearing range. The return value is how many sectors
 * from an already set bad block range are cleared. If the whole bad block
 * range is covered by the clearing range and fully cleared, 'deleted' is
 * set to 1 for the caller to reduce bb->count.
 */
static int front_clear(struct badblocks *bb, int prev,
		       struct badblocks_context *bad, int *deleted)
{
	sector_t sectors = bad->len;
	sector_t s = bad->start;
	u64 *p = bb->page;
	int cleared = 0;

	*deleted = 0;
	if (s == BB_OFFSET(p[prev])) {
		if (BB_LEN(p[prev]) > sectors) {
			p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
					  BB_LEN(p[prev]) - sectors,
					  BB_ACK(p[prev]));
			cleared = sectors;
		} else {
			/* BB_LEN(p[prev]) <= sectors */
			cleared = BB_LEN(p[prev]);
			if ((prev + 1) < bb->count)
				memmove(p + prev, p + prev + 1,
					(bb->count - prev - 1) * 8);
			*deleted = 1;
		}
	} else if (s > BB_OFFSET(p[prev])) {
		if (BB_END(p[prev]) <= (s + sectors)) {
			cleared = BB_END(p[prev]) - s;
			p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
					  s - BB_OFFSET(p[prev]),
					  BB_ACK(p[prev]));
		} else {
			/* Splitting is handled in front_splitting_clear() */
			BUG();
		}
	}

	return cleared;
}

/*
 * Handle the condition that the clearing range hits the middle of an already
 * set bad block range from the bad block table. In this condition the
 * existing bad block range is split into two after the middle part is
 * cleared.
 */
static int front_splitting_clear(struct badblocks *bb, int prev,
				  struct badblocks_context *bad)
{
	u64 *p = bb->page;
	u64 end = BB_END(p[prev]);
	int ack = BB_ACK(p[prev]);
	sector_t sectors = bad->len;
	sector_t s = bad->start;

	p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
			  s - BB_OFFSET(p[prev]),
			  ack);
	memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
	p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
	return sectors;
}
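
/*
 * For example (illustrative numbers): with p[prev] covering [64, 128) and
 * a clearing range starting at 80 with length 16, front_splitting_clear()
 * rewrites p[prev] to [64, 80) and inserts [96, 128) right after it; the
 * caller then increases bb->count for the extra slot.
 */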

/* Do the exact work to clear a bad block range from the bad block table */
static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
	struct badblocks_context bad;
	int prev = -1, hint = -1;
	int len = 0, cleared = 0;
	int rv = 0;
	u64 *p;

	if (bb->shift < 0)
		/* badblocks are disabled */
		return 1;

	if (sectors == 0)
		/* Invalid sectors number */
		return 1;

	if (bb->shift) {
		sector_t target;

		/* When clearing we round the start up and the end down.
		 * This should not matter as the shift should align with
		 * the block size and no rounding should ever be needed.
		 * However it is better to think a block is bad when it
		 * isn't than to think a block is not bad when it is.
		 */
		target = s + sectors;
		s += (1 << bb->shift) - 1;
		s >>= bb->shift;
		target >>= bb->shift;
		sectors = target - s;
	}

	write_seqlock_irq(&bb->lock);

	bad.ack = true;
	p = bb->page;

re_clear:
	bad.start = s;
	bad.len = sectors;

	if (badblocks_empty(bb)) {
		len = sectors;
		cleared++;
		goto update_sectors;
	}

	prev = prev_badblocks(bb, &bad, hint);

	/* Start before all badblocks */
	if (prev < 0) {
		if (overlap_behind(bb, &bad, 0)) {
			len = BB_OFFSET(p[0]) - s;
			hint = 0;
		} else {
			len = sectors;
		}
		/*
		 * Both situations clear a non-bad range and
		 * should be treated as successful
		 */
		cleared++;
		goto update_sectors;
	}

	/* Start after all badblocks */
	if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
		len = sectors;
		cleared++;
		goto update_sectors;
	}

	/* Clear will split a bad record but the table is full */
	if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
	    (BB_END(p[prev]) > (bad.start + sectors))) {
		len = sectors;
		goto update_sectors;
	}

	if (overlap_front(bb, prev, &bad)) {
		if ((BB_OFFSET(p[prev]) < bad.start) &&
		    (BB_END(p[prev]) > (bad.start + bad.len))) {
			/* Splitting */
			if ((bb->count + 1) < MAX_BADBLOCKS) {
				len = front_splitting_clear(bb, prev, &bad);
				bb->count += 1;
				cleared++;
			} else {
				/* No space to split, give up */
				len = sectors;
			}
		} else {
			int deleted = 0;

			len = front_clear(bb, prev, &bad, &deleted);
			bb->count -= deleted;
			cleared++;
			hint = prev;
		}

		goto update_sectors;
	}

	/* Not front overlap, but behind overlap */
	if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
		len = BB_OFFSET(p[prev + 1]) - bad.start;
		hint = prev + 1;
		/* Clearing a non-bad range should be treated as successful */
		cleared++;
		goto update_sectors;
	}

	/* Does not cover any badblocks range in the table */
	len = sectors;
	/* Clearing a non-bad range should be treated as successful */
	cleared++;

update_sectors:
	s += len;
	sectors -= len;

	if (sectors > 0)
		goto re_clear;

	WARN_ON(sectors < 0);

	if (cleared) {
		badblocks_update_acked(bb);
		set_changed(bb);
	}

	write_sequnlock_irq(&bb->lock);

	if (!cleared)
		rv = 1;

	return rv;
}

/* Do the exact work to check a bad block range from the bad block table */
static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
			    sector_t *first_bad, int *bad_sectors)
{
	int unacked_badblocks, acked_badblocks;
	int prev = -1, hint = -1, set = 0;
	struct badblocks_context bad;
	unsigned int seq;
	int len, rv;
	u64 *p;

	WARN_ON(bb->shift < 0 || sectors == 0);

	if (bb->shift > 0) {
		sector_t target;

		/* round the start down, and the end up */
		target = s + sectors;
		s >>= bb->shift;
		target += (1 << bb->shift) - 1;
		target >>= bb->shift;
		sectors = target - s;
	}

retry:
	seq = read_seqbegin(&bb->lock);

	p = bb->page;
	unacked_badblocks = 0;
	acked_badblocks = 0;

re_check:
	bad.start = s;
	bad.len = sectors;

	if (badblocks_empty(bb)) {
		len = sectors;
		goto update_sectors;
	}

	prev = prev_badblocks(bb, &bad, hint);

	/* start after all badblocks */
	if ((prev >= 0) &&
	    ((prev + 1) >= bb->count) && !overlap_front(bb, prev, &bad)) {
		len = sectors;
		goto update_sectors;
	}

	/* Overlapped with front badblocks record */
	if ((prev >= 0) && overlap_front(bb, prev, &bad)) {
		if (BB_ACK(p[prev]))
			acked_badblocks++;
		else
			unacked_badblocks++;

		if (BB_END(p[prev]) >= (s + sectors))
			len = sectors;
		else
			len = BB_END(p[prev]) - s;

		if (set == 0) {
			*first_bad = BB_OFFSET(p[prev]);
			*bad_sectors = BB_LEN(p[prev]);
			set = 1;
		}
		goto update_sectors;
	}

	/* Not front overlap, but behind overlap */
	if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
		len = BB_OFFSET(p[prev + 1]) - bad.start;
		hint = prev + 1;
		goto update_sectors;
	}

	/* Does not cover any badblocks range in the table */
	len = sectors;

update_sectors:
	s += len;
	sectors -= len;

	if (sectors > 0)
		goto re_check;

	WARN_ON(sectors < 0);

	if (unacked_badblocks > 0)
		rv = -1;
	else if (acked_badblocks > 0)
		rv = 1;
	else
		rv = 0;

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return rv;
}

/**
 * badblocks_check() - check a given range for bad sectors
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		sector (start) at which to check for badblocks
 * @sectors:	number of sectors to check for badblocks
 * @first_bad:	pointer to store location of the first badblock
 * @bad_sectors: pointer to store number of badblocks after @first_bad
 *
 * We can record which blocks on each device are 'bad' and so just
 * fail those blocks, or that stripe, rather than the whole device.
 * Entries in the bad-block table are 64bits wide.  This comprises:
 * Length of bad-range, in sectors: 0-511 for lengths 1-512
 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
 *  A 'shift' can be set so that larger blocks are tracked and
 *  consequently larger devices can be covered.
 * 'Acknowledged' flag - 1 bit. - the most significant bit.
 *
 * Locking of the bad-block table uses a seqlock so badblocks_check
 * might need to retry if it is very unlucky.
 * We will sometimes want to check for bad blocks in a bi_end_io function,
 * so we use the write_seqlock_irq variant.
 *
 * When looking for a bad block we specify a range and want to
 * know if any block in the range is bad.  So we binary-search
 * to the last range that starts at-or-before the given endpoint
 * (or "before the sector after the target range"),
 * then see if it ends after the given start.
 *
 * Return:
 *  0: there are no known bad blocks in the range
 *  1: there are known bad blocks which are all acknowledged
 * -1: there are bad blocks which have not yet been acknowledged in metadata.
 * plus the start/length of the first bad section we overlap.
 */
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
			sector_t *first_bad, int *bad_sectors)
{
	return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
}
EXPORT_SYMBOL_GPL(badblocks_check);
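
/*
 * To make the 64-bit entry layout described above concrete, here is a
 * hypothetical round trip through the BB_* accessors declared in
 * <linux/badblocks.h> (values are illustrative only):
 *
 *	u64 entry = BB_MAKE(4096, 8, 1);
 *
 *	BB_OFFSET(entry);	// 4096: the start LBA, from the 54-bit field
 *	BB_LEN(entry);		// 8: the low 9 bits store length - 1, i.e. 7
 *	BB_ACK(entry);		// 1: acknowledged, the most significant bit
 */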

/**
 * badblocks_set() - Add a range of bad blocks to the table.
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		first sector to mark as bad
 * @sectors:	number of sectors to mark as bad
 * @acknowledged: whether to mark the bad sectors as acknowledged
 *
 * This might extend the table, or might contract it if two adjacent ranges
 * can be merged. We binary-search to find the 'insertion' point, then
 * decide how best to handle it.
 *
 * Return:
 *  0: success
 *  1: failed to set badblocks (out of space)
 */
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
			int acknowledged)
{
	return _badblocks_set(bb, s, sectors, acknowledged);
}
EXPORT_SYMBOL_GPL(badblocks_set);

/**
 * badblocks_clear() - Remove a range of bad blocks from the table.
 * @bb:		the badblocks structure that holds all badblock information
 * @s:		first sector to clear
 * @sectors:	number of sectors to clear
 *
 * This may involve extending the table if we split a region,
 * but it must not fail.  So if the table becomes full, we just
 * drop the remove request.
 *
 * Return:
 *  0: success
 *  1: failed to clear badblocks
 */
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
	return _badblocks_clear(bb, s, sectors);
}
EXPORT_SYMBOL_GPL(badblocks_clear);

/**
 * ack_all_badblocks() - Acknowledge all bad blocks in a list.
 * @bb:		the badblocks structure that holds all badblock information
 *
 * This only succeeds if ->changed is clear.  It is used by
 * in-kernel metadata updates.
 */
void ack_all_badblocks(struct badblocks *bb)
{
	if (bb->page == NULL || bb->changed)
		/* no point even trying */
		return;
	write_seqlock_irq(&bb->lock);

	if (bb->changed == 0 && bb->unacked_exist) {
		u64 *p = bb->page;
		int i;

		for (i = 0; i < bb->count; i++) {
			if (!BB_ACK(p[i])) {
				sector_t start = BB_OFFSET(p[i]);
				int len = BB_LEN(p[i]);

				p[i] = BB_MAKE(start, len, 1);
			}
		}
		bb->unacked_exist = 0;
	}
	write_sequnlock_irq(&bb->lock);
}
EXPORT_SYMBOL_GPL(ack_all_badblocks);

/**
 * badblocks_show() - sysfs access to bad-blocks list
 * @bb:		the badblocks structure that holds all badblock information
 * @page:	buffer received from sysfs
 * @unack:	whether to show unacknowledged badblocks
 *
 * Return:
 *  Length of returned data
 */
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
{
	size_t len;
	int i;
	u64 *p = bb->page;
	unsigned seq;

	if (bb->shift < 0)
		return 0;

retry:
	seq = read_seqbegin(&bb->lock);

	len = 0;
	i = 0;

	while (len < PAGE_SIZE && i < bb->count) {
		sector_t s = BB_OFFSET(p[i]);
		unsigned int length = BB_LEN(p[i]);
		int ack = BB_ACK(p[i]);

		i++;

		if (unack && ack)
			continue;

		len += snprintf(page + len, PAGE_SIZE - len, "%llu %u\n",
				(unsigned long long)s << bb->shift,
				length << bb->shift);
	}
	if (unack && len == 0)
		bb->unacked_exist = 0;

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return len;
}
EXPORT_SYMBOL_GPL(badblocks_show);

/**
 * badblocks_store() - sysfs access to bad-blocks list
 * @bb:		the badblocks structure that holds all badblock information
 * @page:	buffer received from sysfs
 * @len:	length of data received from sysfs
 * @unack:	whether to store the badblocks as unacknowledged
 *
 * Return:
 *  Length of the buffer processed or -ve error.
 */
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
			int unack)
{
	unsigned long long sector;
	int length;
	char newline;

	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
	case 3:
		if (newline != '\n')
			return -EINVAL;
		fallthrough;
	case 2:
		if (length <= 0)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (badblocks_set(bb, sector, length, !unack))
		return -ENOSPC;
	else
		return len;
}
EXPORT_SYMBOL_GPL(badblocks_store);
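
/*
 * The accepted input format is "<start sector> <length>" with an optional
 * trailing newline, e.g. (a hypothetical call, not from any real caller):
 *
 *	badblocks_store(bb, "1024 16\n", 8, 0);
 *
 * which marks sectors [1024, 1040) bad and acknowledged (unack == 0), and
 * returns the consumed length 8 on success, or -EINVAL/-ENOSPC on failure.
 */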

static int __badblocks_init(struct device *dev, struct badblocks *bb,
		int enable)
{
	bb->dev = dev;
	bb->count = 0;
	if (enable)
		bb->shift = 0;
	else
		bb->shift = -1;
	if (dev)
		bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
	else
		bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!bb->page) {
		bb->shift = -1;
		return -ENOMEM;
	}
	seqlock_init(&bb->lock);

	return 0;
}

/**
 * badblocks_init() - initialize the badblocks structure
 * @bb:		the badblocks structure that holds all badblock information
 * @enable:	whether to enable badblocks accounting
 *
 * Return:
 *  0: success
 *  -ve errno: on error
 */
int badblocks_init(struct badblocks *bb, int enable)
{
	return __badblocks_init(NULL, bb, enable);
}
EXPORT_SYMBOL_GPL(badblocks_init);

int devm_init_badblocks(struct device *dev, struct badblocks *bb)
{
	if (!bb)
		return -EINVAL;
	return __badblocks_init(dev, bb, 1);
}
EXPORT_SYMBOL_GPL(devm_init_badblocks);
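
/*
 * A minimal lifecycle sketch for a non-devm user (hypothetical driver
 * code, error handling trimmed for brevity):
 *
 *	struct badblocks bb;
 *
 *	if (badblocks_init(&bb, 1))	// enable accounting
 *		return -ENOMEM;
 *	badblocks_set(&bb, 0, 8, 1);	// record sectors [0, 8) as bad
 *	...
 *	badblocks_exit(&bb);		// frees bb.page
 *
 * Drivers that own a struct device should prefer devm_init_badblocks(),
 * which ties the page allocation to the device's lifetime.
 */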

/**
 * badblocks_exit() - free the badblocks structure
 * @bb:		the badblocks structure that holds all badblock information
 */
void badblocks_exit(struct badblocks *bb)
{
	if (!bb)
		return;
	if (bb->dev)
		devm_kfree(bb->dev, bb->page);
	else
		kfree(bb->page);
	bb->page = NULL;
}
EXPORT_SYMBOL_GPL(badblocks_exit);