1Jason M. Felice wrote:
2
3This patch adds the --link-by-hash=DIR option, which hard links each received
4file into a link farm under DIR arranged by the MD4 hash of the file's contents.
5As a result, the system stores only one copy of each distinct file content,
6regardless of the file's name.
7
8To use this patch, run these commands for a successful build:
9
10    patch -p1 <patches/link-by-hash.diff
11    ./prepare-source
12    ./configure
13    make
14
15--- old/Makefile.in
16+++ new/Makefile.in
17@@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
18 	main.o checksum.o match.o syscall.o log.o backup.o
19 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
20 	fileio.o batch.o clientname.o chmod.o
21-OBJS3=progress.o pipe.o
22+OBJS3=progress.o pipe.o hashlink.o
23 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
24 popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
25 	popt/popthelp.o popt/poptparse.o
26--- old/hashlink.c
27+++ new/hashlink.c
28@@ -0,0 +1,339 @@
29+/*
30+   Copyright (C) Cronosys, LLC 2004
31+
32+   This program is free software; you can redistribute it and/or modify
33+   it under the terms of the GNU General Public License as published by
34+   the Free Software Foundation; either version 2 of the License, or
35+   (at your option) any later version.
36+
37+   This program is distributed in the hope that it will be useful,
38+   but WITHOUT ANY WARRANTY; without even the implied warranty of
39+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
40+   GNU General Public License for more details.
41+
42+   You should have received a copy of the GNU General Public License
43+   along with this program; if not, write to the Free Software
44+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
45+*/
46+
47+/* This file contains code used by the --link-by-hash option. */
48+
49+#include "rsync.h"
50+
51+extern char *link_by_hash_dir;
52+
53+#if HAVE_LINK
54+
55+/* Return a malloc'd path "<link_by_hash_dir>/<8 hex>/<24 hex>" built from the
56+ * first 16 bytes of file->u.sum, or NULL if the path cannot be allocated. */
57+char* make_hash_name(struct file_struct *file)
58+{
59+	char hash[34], *dst;	/* 8 hex + '/' + 24 hex characters, plus the NUL */
60+	unsigned char *src = (unsigned char*)file->u.sum, c;
61+	int i;
62+
63+	for (dst = hash, i = 0; i < 4; i++, src++) {
64+		c = *src >> 4;
65+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
66+		c = *src & 0x0f;
67+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
68+	}
69+	*dst++ = '/';
70+	for (i = 0; i < 12; i++, src++) {
71+		c = *src >> 4;
72+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
73+		c = *src & 0x0f;
74+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
75+	}
76+	*dst = 0;
77+	if (asprintf(&dst,"%s/%s",link_by_hash_dir,hash) < 0)
78+		dst = NULL;	/* asprintf leaves dst undefined on failure */
79+	return dst;
80+}
81+
82+/* Release one candidate: free its name, close its descriptor, free the node. */
83+void kill_hashfile(struct hashfile_struct *hashfile)
84+{
85+	if (hashfile != NULL) {
86+		free(hashfile->name);
87+		close(hashfile->fd);
88+		free(hashfile);
89+	}
90+}
91+
92+/* Release every node of a circular hashfile list; a NULL list is a no-op. */
93+void kill_hashfiles(struct hashfile_struct *hashfiles)
94+{
95+	struct hashfile_struct *node = hashfiles, *following;
96+	if (node == NULL)
97+		return;
98+	do {
99+		/* Fetch the successor first: kill_hashfile() frees node. */
100+		following = node->next;
101+		kill_hashfile(node);
102+		node = following;
103+	} while (node != hashfiles);
104+}
104+
105+/* Open every entry of directory hashname whose size is `size`; return them as
106+ * a circular list (NULL if none) and store the largest numeric name in *fnbr.
107+ * hashname stays owned by the caller and is never freed here. */
108+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
109+{
110+	DIR *d;
111+	struct dirent *di;
112+	struct hashfile_struct *hashfiles = NULL, *hashfile;
113+	STRUCT_STAT st;
114+	long this_fnbr;
115+
116+	*fnbr = 0;
117+	if ((d = opendir(hashname)) == NULL) {
118+		rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
119+		return NULL;
120+	}
121+	while ((di = readdir(d)) != NULL) {
122+		if (!strcmp(di->d_name,".") || !strcmp(di->d_name,".."))
123+			continue;
124+
125+		/* We need to have the largest fnbr in case we need to store
126+		 * a new file. */
127+		this_fnbr = atol(di->d_name);
128+		if (this_fnbr > *fnbr)
129+			*fnbr = this_fnbr;
130+		if ((hashfile = new_array(struct hashfile_struct, 1)) == NULL)
131+			break;	/* out of memory: keep the candidates found so far */
132+		hashfile->fd = -1;	/* so kill_hashfile() never closes garbage */
133+		if (asprintf(&hashfile->name,"%s/%s",hashname,di->d_name) < 0) {
134+			free(hashfile);	/* name is undefined after a failure */
135+			break;
136+		}
137+		if (do_stat(hashfile->name,&st) == -1) {
138+			rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
139+			kill_hashfile(hashfile);
140+			continue;
141+		}
142+		if (st.st_size != size) {
143+			kill_hashfile(hashfile);
144+			continue;
145+		}
146+		hashfile->nlink = st.st_nlink;
147+		hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
148+		if (hashfile->fd == -1) {
149+			rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
150+			kill_hashfile(hashfile);
151+			continue;
152+		}
153+		if (hashfiles == NULL)
154+			hashfiles = hashfile->next = hashfile->prev = hashfile;
155+		else {
156+			hashfile->next = hashfiles;
157+			hashfile->prev = hashfiles->prev;
158+			hashfile->next->prev = hashfile;
159+			hashfile->prev->next = hashfile;
160+		}
161+	}
162+	closedir(d);
163+
164+	return hashfiles;
165+}
166+
167+
168+/* Compare the data readable from fd with each open candidate in the circular
169+ * list `files`, BUFSIZ bytes at a time, closing and freeing every candidate
170+ * that differs.  Returns the surviving candidate with the lowest link count
171+ * (the other survivors are freed), or NULL if nothing matched or reading fd
172+ * failed.  The caller releases the result with kill_hashfile(). */
173+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
174+{
175+	int amt, hamt, done;
176+	char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
177+	struct hashfile_struct *iter, *next, *tail, *best;
178+	uint32 nlink;
179+
180+	if (!files)
181+		return NULL;
182+
183+	while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
184+		/* Remember where this pass ends before any node is removed, so
185+		 * every candidate is read exactly once per chunk and all of
186+		 * the descriptors stay at the same offset. */
187+		iter = files;
188+		tail = files->prev;
189+		do {
190+			done = (iter == tail);
191+			next = iter->next;
192+
193+			hamt = read(iter->fd, cmpbuffer, BUFSIZ);
194+			if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
195+				if (next == iter) {
196+					/* The last candidate differs too:
197+					 * there are no matches. */
198+					kill_hashfile(iter);
199+					return NULL;
200+				}
201+				/* Unlink and free the mismatching candidate,
202+				 * moving the list head along if it was the head. */
203+				iter->next->prev = iter->prev;
204+				iter->prev->next = iter->next;
205+				if (iter == files)
206+					files = next;
207+				kill_hashfile(iter);
208+			}
209+
210+			iter = next;
211+		} while (!done);
212+	}
213+
214+	if (amt == -1) {
215+		rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
216+		kill_hashfiles(files);
217+		return NULL;
218+	}
219+
220+	/* If we only have one file left, use it. */
221+	if (files == files->next) {
222+		return files;
223+	}
224+
225+	/* All files which remain in the list are identical and should have
226+	 * the same size.  We pick the one with the lowest link count (we
227+	 * may have rolled over because we hit the maximum link count for
228+	 * the filesystem). */
229+	best = iter = files;
230+	nlink = iter->nlink;
231+	do {
232+		if (iter->nlink < nlink) {
233+			nlink = iter->nlink;
234+			best = iter;
235+		}
236+		iter = iter->next;
237+	} while (iter != files);
238+
239+	/* Take the winner out of the ring so that kill_hashfiles() frees only
240+	 * the other candidates. */
241+	best->next->prev = best->prev;
242+	best->prev->next = best->next;
243+	if (files == best)
244+		files = files->next;
245+	kill_hashfiles(files);
246+	return best;
247+}
248+
249+/* Put the received file fnametmp in place as fname, sharing storage through
250+ * the link farm: hard link fname to an identical file already stored under
251+ * link_by_hash_dir, or rename fnametmp to fname and add it to the farm as a
252+ * new entry.  Returns the status of the last rename/link (negative on error). */
253+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
254+{
255+	STRUCT_STAT st;
256+	char *hashname = make_hash_name(file);
257+	char *linkname = NULL;
258+	int first = 0, rc = -1;
259+	long last_fnbr;
260+
261+	/* Empty files are not shared; without a farm path just rename. */
262+	if (file->length == 0 || hashname == NULL) {
263+		rc = robust_rename(fnametmp, fname, NULL, 0644);
264+		goto cleanup;
265+	}
266+
267+	if (do_stat(hashname, &st) == -1) {
268+		char *dirname;
269+		/* Directory does not exist. */
270+		dirname = strdup(hashname);
271+		*strrchr(dirname,'/') = 0;
272+		if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
273+			rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
274+			free(dirname);
275+			rc = robust_rename(fnametmp, fname, NULL, 0644);
276+			goto cleanup;
277+		}
278+		free(dirname);
279+
280+		if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
281+			rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
282+			rc = robust_rename(fnametmp, fname, NULL, 0644);
283+			goto cleanup;
284+		}
285+
286+		first = 1;
287+		asprintf(&linkname,"%s/0",hashname);
288+		rprintf(FINFO, "(1) linkname = %s\n", linkname);
289+	} else {
290+		struct hashfile_struct *hashfiles, *hashfile;
291+		if (do_stat(fnametmp,&st) == -1) {
292+			rsyserr(FERROR, errno, "stat failed: %s", fnametmp);
293+			goto cleanup;	/* rc is still -1 */
294+		}
295+		hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
296+		if (hashfiles == NULL) {
297+			first = 1;
298+			asprintf(&linkname,"%s/0",hashname);
299+			rprintf(FINFO, "(2) linkname = %s\n", linkname);
300+		} else {
301+			int fd;
302+			/* Search for one identical to us. */
303+			if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
304+				rsyserr(FERROR, errno, "open failed: %s", fnametmp);
305+				kill_hashfiles(hashfiles);
306+				goto cleanup;	/* rc is still -1 */
307+			}
308+			hashfile = compare_hashfiles(fd, hashfiles);
309+			hashfiles = NULL;
310+			close(fd);
311+			if (hashfile) {
312+				first = 0;
313+				linkname = strdup(hashfile->name);
314+				rprintf(FINFO, "(3) linkname = %s\n", linkname);
315+				kill_hashfile(hashfile);
316+			} else {
317+				first = 1;
318+				asprintf(&linkname, "%s/%ld", hashname, last_fnbr + 1);
319+				rprintf(FINFO, "(4) linkname = %s\n", linkname);
320+			}
321+		}
322+	}
323+
324+	if (!first) {
325+		rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
326+				linkname, full_fname(fname));
327+		robust_unlink(fname);
328+		rc = do_link(linkname, fname);
329+		if (rc == -1) {
330+			if (errno == EMLINK) {
331+				first = 1;
332+				free(linkname);
333+				asprintf(&linkname,"%s/%ld",hashname, last_fnbr + 1);
334+				rprintf(FINFO, "(5) linkname = %s\n", linkname);
335+				rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
336+			} else {
337+				rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
338+					linkname, full_fname(fname));
339+				rc = robust_rename(fnametmp, fname, NULL, 0644);
340+			}
341+		} else {
342+			do_unlink(fnametmp);
343+		}
344+	}
345+
346+	if (first) {
347+		rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
348+				full_fname(fname),linkname);
349+		rc = robust_rename(fnametmp, fname, NULL, 0644);
350+		if (rc != 0) {
351+			rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
352+				full_fname(fnametmp), full_fname(fname));
353+		}
354+		/* Never link stale data into the farm after a failed rename. */
355+		if (rc >= 0 && (rc = do_link(fname,linkname)) != 0) {
356+			rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
357+				full_fname(fname), linkname);
358+		}
359+	}
360+
361+cleanup:
362+	free(linkname);
363+	free(hashname);
364+	return rc;
365+}
366+
367+#endif
368--- old/options.c
369+++ new/options.c
370@@ -145,6 +145,7 @@ char *backup_suffix = NULL;
371 char *tmpdir = NULL;
372 char *partial_dir = NULL;
373 char *basis_dir[MAX_BASIS_DIRS+1];
374+char *link_by_hash_dir = NULL;
375 char *config_file = NULL;
376 char *shell_cmd = NULL;
377 char *logfile_name = NULL;
378@@ -349,6 +350,7 @@ void usage(enum logcode F)
379   rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
380   rprintf(F,"     --copy-dest=DIR         ... and include copies of unchanged files\n");
381   rprintf(F,"     --link-dest=DIR         hardlink to files in DIR when unchanged\n");
382+  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash into DIR\n");
383   rprintf(F," -z, --compress              compress file data during the transfer\n");
384   rprintf(F,"     --compress-level=NUM    explicitly set compression level\n");
385   rprintf(F," -C, --cvs-exclude           auto-ignore files the same way CVS does\n");
386@@ -398,7 +400,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OP
387       OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
388       OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
389       OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
390-      OPT_NO_D,
391+      OPT_NO_D, OPT_LINK_BY_HASH,
392       OPT_SERVER, OPT_REFUSED_BASE = 9000};
393 
394 static struct poptOption long_options[] = {
395@@ -499,6 +501,7 @@ static struct poptOption long_options[] 
396   {"compare-dest",     0,  POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
397   {"copy-dest",        0,  POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
398   {"link-dest",        0,  POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
399+  {"link-by-hash",     0,  POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
400   {"fuzzy",           'y', POPT_ARG_NONE,   &fuzzy_basis, 0, 0, 0 },
401   {"compress",        'z', POPT_ARG_NONE,   0, 'z', 0, 0 },
402   {"compress-level",   0,  POPT_ARG_INT,    &def_compress_level, 'z', 0, 0 },
403@@ -1089,6 +1092,21 @@ int parse_arguments(int *argc, const cha
404 			usage(FINFO);
405 			exit_cleanup(0);
406 
407+                case OPT_LINK_BY_HASH:
408+#if HAVE_LINK
409+			arg = poptGetOptArg(pc);
410+			if (sanitize_paths)
411+				arg = sanitize_path(NULL, arg, NULL, 0, NULL);
412+			link_by_hash_dir = (char *)arg;
413+			break;
414+#else
415+			snprintf(err_buf, sizeof err_buf,
416+				 "hard links are not supported on this %s\n",
417+				 am_server ? "server" : "client");
418+			rprintf(FERROR, "ERROR: %s", err_buf);
419+			return 0;
420+#endif
421+
422 		default:
423 			/* A large opt value means that set_refuse_options()
424 			 * turned this option off. */
425@@ -1739,6 +1757,11 @@ void server_options(char **args,int *arg
426 		}
427 	}
428 
429+	if (link_by_hash_dir && am_sender) {
430+		args[ac++] = "--link-by-hash";
431+		args[ac++] = link_by_hash_dir;
432+	}
433+
434 	if (files_from && (!am_sender || filesfrom_host)) {
435 		if (filesfrom_host) {
436 			args[ac++] = "--files-from";
437--- old/receiver.c
438+++ new/receiver.c
439@@ -50,6 +50,7 @@ extern int delay_updates;
440 extern struct stats stats;
441 extern char *stdout_format;
442 extern char *tmpdir;
443+extern char *link_by_hash_dir;
444 extern char *partial_dir;
445 extern char *basis_dir[];
446 extern struct file_list *the_file_list;
447@@ -124,12 +125,13 @@ static int get_tmpname(char *fnametmp, c
448 
449 
450 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
451-			char *fname, int fd, OFF_T total_size)
452+			char *fname, int fd, OFF_T total_size, char *md4)
453 {
454 	static char file_sum1[MD4_SUM_LENGTH];
455 	static char file_sum2[MD4_SUM_LENGTH];
456 	struct map_struct *mapbuf;
457 	struct sum_struct sum;
458+	struct mdfour mdfour_data;
459 	int32 len;
460 	OFF_T offset = 0;
461 	OFF_T offset2;
462@@ -149,6 +151,9 @@ static int receive_data(int f_in, char *
463 	} else
464 		mapbuf = NULL;
465 
466+	if (md4)
467+		mdfour_begin(&mdfour_data);
468+
469 	sum_init(checksum_seed);
470 
471 	if (append_mode) {
472@@ -191,6 +196,8 @@ static int receive_data(int f_in, char *
473 			cleanup_got_literal = 1;
474 
475 			sum_update(data, i);
476+			if (md4)
477+				mdfour_update(&mdfour_data, (uchar*)data, i);
478 
479 			if (fd != -1 && write_file(fd,data,i) != i)
480 				goto report_write_error;
481@@ -217,6 +224,8 @@ static int receive_data(int f_in, char *
482 
483 			see_token(map, len);
484 			sum_update(map, len);
485+			if (md4)
486+				mdfour_update(&mdfour_data, (uchar*)map, len);
487 		}
488 
489 		if (updating_basis) {
490@@ -259,6 +268,8 @@ static int receive_data(int f_in, char *
491 	}
492 
493 	sum_end(file_sum1);
494+	if (md4)
495+		mdfour_result(&mdfour_data, (unsigned char*)md4);
496 
497 	if (mapbuf)
498 		unmap_file(mapbuf);
499@@ -274,7 +285,7 @@ static int receive_data(int f_in, char *
500 
501 static void discard_receive_data(int f_in, OFF_T length)
502 {
503-	receive_data(f_in, NULL, -1, 0, NULL, -1, length);
504+	receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
505 }
506 
507 static void handle_delayed_updates(struct file_list *flist, char *local_name)
508@@ -611,8 +622,12 @@ int recv_files(int f_in, struct file_lis
509 			rprintf(FINFO, "%s\n", fname);
510 
511 		/* recv file data */
512+#if HAVE_LINK
513+		if (link_by_hash_dir)
514+			file->u.sum = new_array(char, MD4_SUM_LENGTH);
515+#endif
516 		recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
517-				       fname, fd2, file->length);
518+				       fname, fd2, file->length, file->u.sum);
519 
520 		log_item(log_code, file, &initial_stats, iflags, NULL);
521 
522--- old/rsync.c
523+++ new/rsync.c
524@@ -48,6 +48,7 @@ extern int inplace;
525 extern int keep_dirlinks;
526 extern int make_backups;
527 extern mode_t orig_umask;
528+extern char *link_by_hash_dir;
529 extern struct stats stats;
530 extern struct chmod_mode_struct *daemon_chmod_modes;
531 
532@@ -271,8 +272,15 @@ void finish_transfer(char *fname, char *
533 	/* move tmp file over real file */
534 	if (verbose > 2)
535 		rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
536-	ret = robust_rename(fnametmp, fname, partialptr,
537-			    file->mode & INITACCESSPERMS);
538+#if HAVE_LINK
539+	if (link_by_hash_dir)
540+		ret = link_by_hash(fnametmp, fname, file);
541+	else
542+#endif
543+	{
544+		ret = robust_rename(fnametmp, fname, partialptr,
545+				    file->mode & INITACCESSPERMS);
546+	}
547 	if (ret < 0) {
548 		rsyserr(FERROR, errno, "%s %s -> \"%s\"",
549 			ret == -2 ? "copy" : "rename",
550--- old/rsync.h
551+++ new/rsync.h
552@@ -651,6 +651,14 @@ struct stats {
553 	int current_file_index;
554 };
555 
556+struct hashfile_struct {		/* one open candidate file in a link farm directory */
557+	struct hashfile_struct *next;	/* circular, doubly linked list links */
558+	struct hashfile_struct *prev;
559+	char *name;			/* heap-allocated path of the candidate file */
560+	int fd;				/* descriptor opened read-only on name */
561+	uint32 nlink;			/* st_nlink of the file when it was stat()ed */
562+};
563+
564 struct chmod_mode_struct;
565 
566 #include "byteorder.h"
567--- old/rsync.yo
568+++ new/rsync.yo
569@@ -366,6 +366,7 @@ to the detailed description below for a 
570      --compare-dest=DIR      also compare received files relative to DIR
571      --copy-dest=DIR         ... and include copies of unchanged files
572      --link-dest=DIR         hardlink to files in DIR when unchanged
573+     --link-by-hash=DIR      create hardlinks by hash into DIR
574  -z, --compress              compress file data during the transfer
575      --compress-level=NUM    explicitly set compression level
576  -C, --cvs-exclude           auto-ignore files in the same way CVS does
577--- old/proto.h
578+++ new/proto.h
579@@ -89,6 +89,11 @@ void itemize(struct file_struct *file, i
580 int unchanged_file(char *fn, struct file_struct *file, STRUCT_STAT *st);
581 void check_for_finished_hlinks(int itemizing, enum logcode code);
582 void generate_files(int f_out, struct file_list *flist, char *local_name);
583+void kill_hashfile(struct hashfile_struct *hashfile);
584+void kill_hashfiles(struct hashfile_struct *hashfiles);
585+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr);
586+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files);
587+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file);
588 void init_hard_links(void);
589 int hard_link_check(struct file_struct *file, int ndx, char *fname,
590 		    int statret, STRUCT_STAT *st, int itemizing,
591--- old/rsync.1
592+++ new/rsync.1
593@@ -432,6 +432,7 @@ to the detailed description below for a 
594      \-\-compare\-dest=DIR      also compare received files relative to DIR
595      \-\-copy\-dest=DIR         \&.\&.\&. and include copies of unchanged files
596      \-\-link\-dest=DIR         hardlink to files in DIR when unchanged
597+     \-\-link\-by\-hash=DIR      create hardlinks by hash into DIR
598  \-z, \-\-compress              compress file data during the transfer
599      \-\-compress\-level=NUM    explicitly set compression level
600  \-C, \-\-cvs\-exclude           auto-ignore files in the same way CVS does
601