Jason M. Felice wrote:

This patch adds the --link-by-hash=DIR option, which hard links received
files in a link farm arranged by MD4 file hash.  The result is that the system
will only store one copy of the unique contents of each file, regardless of
the file's name.

To use this patch, run these commands for a successful build:

    patch -p1 <patches/link-by-hash.diff
    ./prepare-source
    ./configure
    make

--- old/Makefile.in
+++ new/Makefile.in
@@ -35,7 +35,7 @@ OBJS1=rsync.o generator.o receiver.o cle
 	main.o checksum.o match.o syscall.o log.o backup.o
 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
 	fileio.o batch.o clientname.o chmod.o
-OBJS3=progress.o pipe.o
+OBJS3=progress.o pipe.o hashlink.o
 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
 	popt/popthelp.o popt/poptparse.o
--- old/hashlink.c
+++ new/hashlink.c
@@ -0,0 +1,438 @@
+/*
+   Copyright (C) Cronosys, LLC 2004
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+/* This file contains code used by the --link-by-hash option. */
+
+#include "rsync.h"
+
+extern char *link_by_hash_dir;
+
+#if HAVE_LINK
+
+/* Number of leading checksum bytes that name the first-level directory. */
+#define HASH_PREFIX_BYTES 4
+
+/* Write the lowercase hex form of len bytes from src to dst, without a
+ * terminating NUL.  Returns the position just past the last digit. */
+static char *hex_encode(char *dst, const unsigned char *src, int len)
+{
+	static const char digits[] = "0123456789abcdef";
+	int i;
+
+	for (i = 0; i < len; i++) {
+		*dst++ = digits[src[i] >> 4];
+		*dst++ = digits[src[i] & 0x0f];
+	}
+	return dst;
+}
+
+/* Build the link-farm directory name for file's checksum.
+ *
+ * The MD4_SUM_LENGTH bytes at file->u.sum are written as hex digits with a
+ * '/' after the first HASH_PREFIX_BYTES bytes, and the result is appended
+ * to link_by_hash_dir.
+ *
+ * Returns a malloc'd string that the caller must free(), or NULL if it
+ * could not be allocated. */
+char* make_hash_name(struct file_struct *file)
+{
+	/* Two digits per byte plus the '/' and the terminating NUL. */
+	char hash[MD4_SUM_LENGTH * 2 + 2], *dst;
+	unsigned char *src = (unsigned char*)file->u.sum;
+
+	dst = hex_encode(hash, src, HASH_PREFIX_BYTES);
+	*dst++ = '/';
+	dst = hex_encode(dst, src + HASH_PREFIX_BYTES,
+			 MD4_SUM_LENGTH - HASH_PREFIX_BYTES);
+	*dst = '\0';
+
+	if (asprintf(&dst, "%s/%s", link_by_hash_dir, hash) < 0)
+		return NULL;
+	return dst;
+}
+
+/* Release one candidate: its name, its descriptor if one is open, and the
+ * node itself.  A NULL pointer is ignored. */
+void kill_hashfile(struct hashfile_struct *hashfile)
+{
+	if (!hashfile)
+		return;
+	free(hashfile->name);
+	if (hashfile->fd >= 0)
+		close(hashfile->fd);
+	free(hashfile);
+}
+
+/* Release every node of the circular list headed by hashfiles.  A NULL
+ * list is ignored. */
+void kill_hashfiles(struct hashfile_struct *hashfiles)
+{
+	struct hashfile_struct *iter, *next;
+
+	if (!hashfiles)
+		return;
+	/* Open the ring so the walk ends without looking at a freed node. */
+	hashfiles->prev->next = NULL;
+	for (iter = hashfiles; iter; iter = next) {
+		next = iter->next;
+		kill_hashfile(iter);
+	}
+}
+
+/* Detach node from the circular list headed by head and leave it linked to
+ * itself.  Returns the new head, which is NULL if the list is now empty. */
+static struct hashfile_struct *detach_hashfile(struct hashfile_struct *head,
+					       struct hashfile_struct *node)
+{
+	if (node->next == node)
+		return NULL;
+	node->prev->next = node->next;
+	node->next->prev = node->prev;
+	if (head == node)
+		head = node->next;
+	node->next = node->prev = node;
+	return head;
+}
+
+/* Collect the files in directory hashname whose size equals size.
+ *
+ * Each such file is opened read-only and appended to a circular list.
+ * *fnbr is set to the largest number found among the directory's entry
+ * names (0 if there is none) so the caller can name a new file.
+ *
+ * hashname stays owned by the caller.  Returns the head of the list, or
+ * NULL if the directory cannot be opened or holds no candidate. */
+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
+{
+	DIR *d;
+	struct dirent *di;
+	struct hashfile_struct *hashfiles = NULL, *hashfile;
+	STRUCT_STAT st;
+	long this_fnbr;
+
+	*fnbr = 0;
+
+	/* Build a list of potential candidates and open
+	 * them. */
+	if ((d = opendir(hashname)) == NULL) {
+		rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
+		return NULL;
+	}
+	while ((di = readdir(d)) != NULL) {
+		if (!strcmp(di->d_name,".") || !strcmp(di->d_name,".."))
+			continue;
+
+		/* We need to have the largest fnbr in case we need to store
+		 * a new file. */
+		this_fnbr = atol(di->d_name);
+		if (this_fnbr > *fnbr)
+			*fnbr = this_fnbr;
+
+		hashfile = new_array(struct hashfile_struct, 1);
+		if (!hashfile) {
+			rsyserr(FERROR, ENOMEM, "malloc failed in find_hashfiles()");
+			break;
+		}
+		/* Make the node safe to pass to kill_hashfile() from here on. */
+		hashfile->name = NULL;
+		hashfile->fd = -1;
+		hashfile->next = hashfile->prev = hashfile;
+
+		if (asprintf(&hashfile->name,"%s/%s",hashname,
+			     di->d_name) < 0) {
+			rsyserr(FERROR, ENOMEM, "malloc failed in find_hashfiles()");
+			hashfile->name = NULL;
+			kill_hashfile(hashfile);
+			break;
+		}
+		if (do_stat(hashfile->name,&st) == -1) {
+			rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
+			kill_hashfile(hashfile);
+			continue;
+		}
+		if (st.st_size != size) {
+			kill_hashfile(hashfile);
+			continue;
+		}
+		hashfile->nlink = st.st_nlink;
+		hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
+		if (hashfile->fd == -1) {
+			rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
+			kill_hashfile(hashfile);
+			continue;
+		}
+		if (hashfiles == NULL)
+			hashfiles = hashfile;
+		else {
+			hashfile->next = hashfiles;
+			hashfile->prev = hashfiles->prev;
+			hashfile->next->prev = hashfile;
+			hashfile->prev->next = hashfile;
+		}
+	}
+	closedir(d);
+
+	return hashfiles;
+}
+
+/* Find a candidate whose contents equal the data read from fd.
+ *
+ * fd is read in BUFSIZ chunks from its current position and each chunk is
+ * compared with the next chunk of every remaining candidate; a candidate
+ * that differs is released.  If several candidates survive, the one with
+ * the lowest link count is returned and the rest are released.
+ *
+ * Returns the chosen node, which the caller must pass to kill_hashfile(),
+ * or NULL, with the whole list released, if nothing matches or reading fd
+ * fails.  A NULL list gives NULL. */
+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
+{
+	int amt, hamt;
+	char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
+	struct hashfile_struct *iter, *next, *last, *best;
+
+	if (!files)
+		return NULL;
+
+	while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
+		/* Visit every candidate exactly once per chunk so that all of
+		 * the descriptors advance together.  The tail is noted first
+		 * because the head may be dropped during the pass. */
+		last = files->prev;
+		iter = files;
+		for (;;) {
+			int at_tail = iter == last;
+
+			next = iter->next;
+			hamt = read(iter->fd, cmpbuffer, BUFSIZ);
+			if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
+				files = detach_hashfile(files, iter);
+				kill_hashfile(iter);
+			}
+			if (at_tail)
+				break;
+			iter = next;
+		}
+
+		if (!files) {
+			/* There are no matches. */
+			return NULL;
+		}
+	}
+
+	if (amt == -1) {
+		rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
+		kill_hashfiles(files);
+		return NULL;
+	}
+
+	/* All files which remain in the list are identical and should have
+	 * the same size.  We pick the one with the lowest link count (we
+	 * may have rolled over because we hit the maximum link count for
+	 * the filesystem). */
+	best = iter = files;
+	do {
+		if (iter->nlink < best->nlink)
+			best = iter;
+		iter = iter->next;
+	} while (iter != files);
+
+	files = detach_hashfile(files, best);
+	kill_hashfiles(files);
+	return best;
+}
+
+/* Create hashname and its parent directory.  A directory that already
+ * exists is not an error.  Returns 0 on success or -1 after logging. */
+static int make_hash_dirs(char *hashname)
+{
+	char *dirname, *slash;
+
+	if ((dirname = strdup(hashname)) == NULL) {
+		rsyserr(FERROR, ENOMEM, "mkdir failed: %s", hashname);
+		return -1;
+	}
+	if ((slash = strrchr(dirname, '/')) != NULL)
+		*slash = '\0';
+	if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
+		rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
+		free(dirname);
+		return -1;
+	}
+	free(dirname);
+
+	if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
+		rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
+		return -1;
+	}
+	return 0;
+}
+
+/* Return a malloc'd "<hashname>/<nbr>", or NULL if it cannot be allocated. */
+static char *hash_link_name(char *hashname, long nbr)
+{
+	char *linkname;
+
+	if (asprintf(&linkname, "%s/%ld", hashname, nbr) < 0)
+		return NULL;
+	return linkname;
+}
+
+/* Move the received file fnametmp into place as fname, sharing its data
+ * with the link farm.
+ *
+ * If a file with identical contents is already stored under the checksum's
+ * directory, fname becomes a hard link to it and fnametmp is removed.
+ * Otherwise fnametmp is renamed to fname and a new numbered link to it is
+ * added to the directory.  Empty files, and files for which the farm cannot
+ * be used, are simply renamed into place.
+ *
+ * Returns the result of robust_rename() when the file was renamed, 0 when
+ * it was linked to an existing file, or -1 if fnametmp could not be
+ * examined. */
+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
+{
+	STRUCT_STAT st;
+	struct hashfile_struct *hashfiles, *hashfile;
+	char *hashname, *linkname = NULL;
+	int first = 0, step = 0, fd, rc, save_errno;
+	long last_fnbr = 0;
+
+	if (file->length == 0)
+		return robust_rename(fnametmp, fname, NULL, 0644);
+
+	/* Built after the empty-file check so the return above has nothing
+	 * to free. */
+	if ((hashname = make_hash_name(file)) == NULL)
+		return robust_rename(fnametmp, fname, NULL, 0644);
+
+	if (do_stat(hashname, &st) == -1) {
+		/* Directory does not exist. */
+		if (make_hash_dirs(hashname) < 0)
+			goto plain_rename;
+		first = 1;
+		step = 1;
+		linkname = hash_link_name(hashname, 0);
+	} else {
+		if (do_stat(fnametmp,&st) == -1) {
+			save_errno = errno;
+			rsyserr(FERROR, save_errno, "stat failed: %s", fnametmp);
+			errno = save_errno;
+			rc = -1;
+			goto done;
+		}
+		hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
+
+		if (hashfiles == NULL) {
+			/* Nothing of our size is stored here.  The highest
+			 * number may still be in use by a file of another
+			 * size, so step past it if that name exists. */
+			first = 1;
+			step = 2;
+			linkname = hash_link_name(hashname, last_fnbr);
+			if (linkname && do_stat(linkname, &st) == 0) {
+				free(linkname);
+				linkname = hash_link_name(hashname,
+							  last_fnbr + 1);
+			}
+		} else {
+			/* Search for one identical to us. */
+			if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
+				save_errno = errno;
+				rsyserr(FERROR, save_errno, "open failed: %s", fnametmp);
+				kill_hashfiles(hashfiles);
+				errno = save_errno;
+				rc = -1;
+				goto done;
+			}
+			hashfile = compare_hashfiles(fd, hashfiles);
+			close(fd);
+
+			if (hashfile) {
+				step = 3;
+				linkname = strdup(hashfile->name);
+				kill_hashfile(hashfile);
+			} else {
+				first = 1;
+				step = 4;
+				linkname = hash_link_name(hashname,
+							  last_fnbr + 1);
+			}
+		}
+	}
+	if (!linkname)
+		goto plain_rename;
+	rprintf(FINFO, "(%d) linkname = %s\n", step, linkname);
+
+	if (!first) {
+		rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
+				linkname, full_fname(fname));
+		robust_unlink(fname);
+		rc = do_link(linkname, fname);
+		if (rc == 0) {
+			do_unlink(fnametmp);
+		} else if (errno == EMLINK) {
+			first = 1;
+			free(linkname);
+			linkname = hash_link_name(hashname, last_fnbr + 1);
+			if (!linkname)
+				goto plain_rename;
+			rprintf(FINFO, "(5) linkname = %s\n", linkname);
+			rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
+		} else {
+			rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
+				linkname, full_fname(fname));
+			rc = robust_rename(fnametmp, fname, NULL, 0644);
+		}
+	}
+
+	if (first) {
+		rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
+				full_fname(fname),linkname);
+
+		/* The rename result is what gets returned: failing to add the
+		 * file to the farm does not undo a successful transfer. */
+		rc = robust_rename(fnametmp, fname, NULL, 0644);
+		if (rc < 0) {
+			/* NOTE(review): both names come from full_fname() in
+			 * one call; confirm it does not reuse its storage. */
+			save_errno = errno;
+			rsyserr(FERROR, save_errno, "rename \"%s\" -> \"%s\"",
+				full_fname(fnametmp), full_fname(fname));
+			errno = save_errno;
+		} else if (do_link(fname,linkname) != 0) {
+			rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
+				full_fname(fname), linkname);
+		}
+	}
+
+done:
+	save_errno = errno;
+	free(linkname);
+	free(hashname);
+	errno = save_errno;
+	return rc;
+
+plain_rename:
+	/* The link farm cannot be used for this file; just move it into place. */
+	free(linkname);
+	free(hashname);
+	return robust_rename(fnametmp, fname, NULL, 0644);
+}
+
+#endif
--- old/options.c
+++ new/options.c
@@ -145,6 +145,7 @@ char *backup_suffix = NULL;
 char *tmpdir = NULL;
 char *partial_dir = NULL;
373 char *basis_dir[MAX_BASIS_DIRS+1]; 374+char *link_by_hash_dir = NULL; 375 char *config_file = NULL; 376 char *shell_cmd = NULL; 377 char *logfile_name = NULL; 378@@ -349,6 +350,7 @@ void usage(enum logcode F) 379 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n"); 380 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n"); 381 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n"); 382+ rprintf(F," --link-by-hash=DIR create hardlinks by hash into DIR\n"); 383 rprintf(F," -z, --compress compress file data during the transfer\n"); 384 rprintf(F," --compress-level=NUM explicitly set compression level\n"); 385 rprintf(F," -C, --cvs-exclude auto-ignore files the same way CVS does\n"); 386@@ -398,7 +400,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OP 387 OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP, 388 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD, 389 OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE, 390- OPT_NO_D, 391+ OPT_NO_D, OPT_LINK_BY_HASH, 392 OPT_SERVER, OPT_REFUSED_BASE = 9000}; 393 394 static struct poptOption long_options[] = { 395@@ -499,6 +501,7 @@ static struct poptOption long_options[] 396 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 }, 397 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 }, 398 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 }, 399+ {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0}, 400 {"fuzzy", 'y', POPT_ARG_NONE, &fuzzy_basis, 0, 0, 0 }, 401 {"compress", 'z', POPT_ARG_NONE, 0, 'z', 0, 0 }, 402 {"compress-level", 0, POPT_ARG_INT, &def_compress_level, 'z', 0, 0 }, 403@@ -1089,6 +1092,21 @@ int parse_arguments(int *argc, const cha 404 usage(FINFO); 405 exit_cleanup(0); 406 407+ case OPT_LINK_BY_HASH: 408+#if HAVE_LINK 409+ arg = poptGetOptArg(pc); 410+ if (sanitize_paths) 411+ arg = sanitize_path(NULL, arg, NULL, 0, NULL); 412+ link_by_hash_dir = 
(char *)arg; 413+ break; 414+#else 415+ snprintf(err_buf, sizeof err_buf, 416+ "hard links are not supported on this %s\n", 417+ am_server ? "server" : "client"); 418+ rprintf(FERROR, "ERROR: %s", err_buf); 419+ return 0; 420+#endif 421+ 422 default: 423 /* A large opt value means that set_refuse_options() 424 * turned this option off. */ 425@@ -1739,6 +1757,11 @@ void server_options(char **args,int *arg 426 } 427 } 428 429+ if (link_by_hash_dir && am_sender) { 430+ args[ac++] = "--link-by-hash"; 431+ args[ac++] = link_by_hash_dir; 432+ } 433+ 434 if (files_from && (!am_sender || filesfrom_host)) { 435 if (filesfrom_host) { 436 args[ac++] = "--files-from"; 437--- old/receiver.c 438+++ new/receiver.c 439@@ -50,6 +50,7 @@ extern int delay_updates; 440 extern struct stats stats; 441 extern char *stdout_format; 442 extern char *tmpdir; 443+extern char *link_by_hash_dir; 444 extern char *partial_dir; 445 extern char *basis_dir[]; 446 extern struct file_list *the_file_list; 447@@ -124,12 +125,13 @@ static int get_tmpname(char *fnametmp, c 448 449 450 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r, 451- char *fname, int fd, OFF_T total_size) 452+ char *fname, int fd, OFF_T total_size, char *md4) 453 { 454 static char file_sum1[MD4_SUM_LENGTH]; 455 static char file_sum2[MD4_SUM_LENGTH]; 456 struct map_struct *mapbuf; 457 struct sum_struct sum; 458+ struct mdfour mdfour_data; 459 int32 len; 460 OFF_T offset = 0; 461 OFF_T offset2; 462@@ -149,6 +151,9 @@ static int receive_data(int f_in, char * 463 } else 464 mapbuf = NULL; 465 466+ if (md4) 467+ mdfour_begin(&mdfour_data); 468+ 469 sum_init(checksum_seed); 470 471 if (append_mode) { 472@@ -191,6 +196,8 @@ static int receive_data(int f_in, char * 473 cleanup_got_literal = 1; 474 475 sum_update(data, i); 476+ if (md4) 477+ mdfour_update(&mdfour_data, (uchar*)data, i); 478 479 if (fd != -1 && write_file(fd,data,i) != i) 480 goto report_write_error; 481@@ -217,6 +224,8 @@ static int receive_data(int 
f_in, char * 482 483 see_token(map, len); 484 sum_update(map, len); 485+ if (md4) 486+ mdfour_update(&mdfour_data, (uchar*)map, len); 487 } 488 489 if (updating_basis) { 490@@ -259,6 +268,8 @@ static int receive_data(int f_in, char * 491 } 492 493 sum_end(file_sum1); 494+ if (md4) 495+ mdfour_result(&mdfour_data, (unsigned char*)md4); 496 497 if (mapbuf) 498 unmap_file(mapbuf); 499@@ -274,7 +285,7 @@ static int receive_data(int f_in, char * 500 501 static void discard_receive_data(int f_in, OFF_T length) 502 { 503- receive_data(f_in, NULL, -1, 0, NULL, -1, length); 504+ receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL); 505 } 506 507 static void handle_delayed_updates(struct file_list *flist, char *local_name) 508@@ -611,8 +622,12 @@ int recv_files(int f_in, struct file_lis 509 rprintf(FINFO, "%s\n", fname); 510 511 /* recv file data */ 512+#if HAVE_LINK 513+ if (link_by_hash_dir) 514+ file->u.sum = new_array(char, MD4_SUM_LENGTH); 515+#endif 516 recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size, 517- fname, fd2, file->length); 518+ fname, fd2, file->length, file->u.sum); 519 520 log_item(log_code, file, &initial_stats, iflags, NULL); 521 522--- old/rsync.c 523+++ new/rsync.c 524@@ -48,6 +48,7 @@ extern int inplace; 525 extern int keep_dirlinks; 526 extern int make_backups; 527 extern mode_t orig_umask; 528+extern char *link_by_hash_dir; 529 extern struct stats stats; 530 extern struct chmod_mode_struct *daemon_chmod_modes; 531 532@@ -271,8 +272,15 @@ void finish_transfer(char *fname, char * 533 /* move tmp file over real file */ 534 if (verbose > 2) 535 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname); 536- ret = robust_rename(fnametmp, fname, partialptr, 537- file->mode & INITACCESSPERMS); 538+#if HAVE_LINK 539+ if (link_by_hash_dir) 540+ ret = link_by_hash(fnametmp, fname, file); 541+ else 542+#endif 543+ { 544+ ret = robust_rename(fnametmp, fname, partialptr, 545+ file->mode & INITACCESSPERMS); 546+ } 547 if (ret < 0) { 548 rsyserr(FERROR, 
errno, "%s %s -> \"%s\"", 549 ret == -2 ? "copy" : "rename", 550--- old/rsync.h 551+++ new/rsync.h 552@@ -651,6 +651,14 @@ struct stats { 553 int current_file_index; 554 }; 555 556+struct hashfile_struct { 557+ struct hashfile_struct *next; 558+ struct hashfile_struct *prev; 559+ char *name; 560+ int fd; 561+ uint32 nlink; 562+}; 563+ 564 struct chmod_mode_struct; 565 566 #include "byteorder.h" 567--- old/rsync.yo 568+++ new/rsync.yo 569@@ -366,6 +366,7 @@ to the detailed description below for a 570 --compare-dest=DIR also compare received files relative to DIR 571 --copy-dest=DIR ... and include copies of unchanged files 572 --link-dest=DIR hardlink to files in DIR when unchanged 573+ --link-by-hash=DIR create hardlinks by hash into DIR 574 -z, --compress compress file data during the transfer 575 --compress-level=NUM explicitly set compression level 576 -C, --cvs-exclude auto-ignore files in the same way CVS does 577--- old/proto.h 578+++ new/proto.h 579@@ -89,6 +89,11 @@ void itemize(struct file_struct *file, i 580 int unchanged_file(char *fn, struct file_struct *file, STRUCT_STAT *st); 581 void check_for_finished_hlinks(int itemizing, enum logcode code); 582 void generate_files(int f_out, struct file_list *flist, char *local_name); 583+void kill_hashfile(struct hashfile_struct *hashfile); 584+void kill_hashfiles(struct hashfile_struct *hashfiles); 585+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr); 586+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files); 587+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file); 588 void init_hard_links(void); 589 int hard_link_check(struct file_struct *file, int ndx, char *fname, 590 int statret, STRUCT_STAT *st, int itemizing, 591--- old/rsync.1 592+++ new/rsync.1 593@@ -432,6 +432,7 @@ to the detailed description below for a 594 \-\-compare\-dest=DIR also compare received files relative to DIR 595 \-\-copy\-dest=DIR \&.\&.\&. 
and include copies of unchanged files 596 \-\-link\-dest=DIR hardlink to files in DIR when unchanged 597+ \-\-link\-by\-hash=DIR create hardlinks by hash into DIR 598 \-z, \-\-compress compress file data during the transfer 599 \-\-compress\-level=NUM explicitly set compression level 600 \-C, \-\-cvs\-exclude auto-ignore files in the same way CVS does 601