After applying this patch and running configure, you MUST run this
command before "make":

    make proto

Jason M. Felice writes:

This patch adds the --link-by-hash=DIR option, which hard links received
files in a link farm arranged by MD4 file hash.  The result is that the system
will only store one copy of the unique contents of each file, regardless of
the file's name.


--- orig/Makefile.in	2005-07-07 21:29:57
+++ Makefile.in	2004-07-03 20:20:15
@@ -34,7 +34,7 @@ OBJS1=rsync.o generator.o receiver.o cle
 	main.o checksum.o match.o syscall.o log.o backup.o
 OBJS2=options.o flist.o io.o compat.o hlink.o token.o uidlist.o socket.o \
 	fileio.o batch.o clientname.o
-OBJS3=progress.o pipe.o
+OBJS3=progress.o pipe.o hashlink.o
 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
 popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
 	popt/popthelp.o popt/poptparse.o
--- orig/hashlink.c	2004-09-24 16:44:25
+++ hashlink.c	2004-09-24 16:44:25
@@ -0,0 +1,340 @@
+/*
+   Copyright (C) Cronosys, LLC 2004
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+/* This file contains code used by the --link-by-hash option. */
+
+#include "rsync.h"
+
+extern char *link_by_hash_dir;
+
+#if HAVE_LINK
+
+char* make_hash_name(struct file_struct *file)
+{
+	char hash[33], *dst;
+	unsigned char *src;
+	unsigned char c;
+	int i;
+
+	src = (unsigned char*)file->u.sum;
+	for (dst = hash, i = 0; i < 4; i++, src++) {
+		c = *src >> 4;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+		c = *src & 0x0f;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+	}
+	*dst++ = '/';
+	for (i = 0; i < 12; i++, src++) {
+		c = *src >> 4;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+		c = *src & 0x0f;
+		*(dst++) = (c >= 10) ? (c - 10 + 'a') : (c + '0');
+	}
+	*dst = 0;
+
+	asprintf(&dst,"%s/%s",link_by_hash_dir,hash);
+	return dst;
+}
+
+
+void kill_hashfile(struct hashfile_struct *hashfile)
+{
+	if (!hashfile)
+		return;
+	free(hashfile->name);
+	close(hashfile->fd);
+	free(hashfile);
+}
+
+
+void kill_hashfiles(struct hashfile_struct *hashfiles)
+{
+	struct hashfile_struct *iter, *next;
+	if ((iter = hashfiles) != NULL) {
+		do {
+			next = iter->next;
+			kill_hashfile(iter);
+			iter = next;
+		} while (iter != hashfiles);
+	}
+}
+
+
+struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
+{
+	DIR *d;
+	struct dirent *di;
+	struct hashfile_struct *hashfiles = NULL, *hashfile;
+	STRUCT_STAT st;
+	long this_fnbr;
+
+	*fnbr = 0;
+
+	/* Build a list of potential candidates and open
+	 * them. */
+	if ((d = opendir(hashname)) == NULL) {
+		rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
+		free(hashname);
+		return NULL;
+	}
+	while ((di = readdir(d)) != NULL) {
+		if (!strcmp(di->d_name,".") || !strcmp(di->d_name,"..")) {
+			continue;
+		}
+
+		/* We need to have the largest fnbr in case we need to store
+		 * a new file. */
+		this_fnbr = atol(di->d_name);
+		if (this_fnbr > *fnbr)
+			*fnbr = this_fnbr;
+
+		hashfile = new_array(struct hashfile_struct, 1);
+		asprintf(&hashfile->name,"%s/%s",hashname,
+			 di->d_name);
+		if (do_stat(hashfile->name,&st) == -1) {
+			rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
+			kill_hashfile(hashfile);
+			continue;
+		}
+		if (st.st_size != size) {
+			kill_hashfile(hashfile);
+			continue;
+		}
+		hashfile->nlink = st.st_nlink;
+		hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
+		if (hashfile->fd == -1) {
+			rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
+			kill_hashfile(hashfile);
+			continue;
+		}
+		if (hashfiles == NULL)
+			hashfiles = hashfile->next = hashfile->prev = hashfile;
+		else {
+			hashfile->next = hashfiles;
+			hashfile->prev = hashfiles->prev;
+			hashfile->next->prev = hashfile;
+			hashfile->prev->next = hashfile;
+		}
+	}
+	closedir(d);
+
+	return hashfiles;
+}
+
+
+struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
+{
+	int amt, hamt;
+	char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
+	struct hashfile_struct *iter, *next, *best;
+	uint32 nlink;
+
+	if (!files)
+		return NULL;
+
+	iter = files; /* in case files are 0 bytes */
+	while ((amt = read(fd, buffer, BUFSIZ)) > 0) {
+		iter = files;
+		do {
+			/* Icky bit to resync when we steal the first node. */
+			if (!files)
+				files = iter;
+
+			next = iter->next;
+
+			hamt = read(iter->fd, cmpbuffer, BUFSIZ);
+			if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) {
+				if (iter == files) {
+					files = files->prev;
+				}
+				if (iter->next == iter) {
+					files = next = NULL;
+				} else {
+					next = iter->next;
+					if (iter == files) {
+						/* So we know to resync */
+						files = NULL;
+					}
+				}
+				iter->next->prev = iter->prev;
+				iter->prev->next = iter->next;
+				kill_hashfile(iter);
+			}
+
+			iter = next;
+		} while (iter != files);
+
+		if (iter == NULL && files == NULL) {
+			/* There are no matches. */
+			return NULL;
+		}
+	}
+
+	if (amt == -1) {
+		rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
+		kill_hashfiles(files);
+		return NULL;
+	}
+
+	/* If we only have one file left, use it. */
+	if (files == files->next) {
+		return files;
+	}
+
+	/* All files which remain in the list are identical and should have
+	 * the same size.  We pick the one with the lowest link count (we
+	 * may have rolled over because we hit the maximum link count for
+	 * the filesystem). */
+	best = iter = files;
+	nlink = iter->nlink;
+	do {
+		if (iter->nlink < nlink) {
+			nlink = iter->nlink;
+			best = iter;
+		}
+		iter = iter->next;
+	} while (iter != files);
+
+	best->next->prev = best->prev;
+	best->prev->next = best->next;
+	if (files == best)
+		files = files->next;
+	kill_hashfiles(files);
+	return best;
+}
+
+
+int link_by_hash(char *fnametmp,char *fname,struct file_struct *file)
+{
+	STRUCT_STAT st;
+	char *hashname = make_hash_name(file);
+	int first = 0, rc;
+	char *linkname;
+	long last_fnbr;
+
+	if (file->length == 0) {
+		return robust_rename(fnametmp,fname,0644);
+	}
+
+	if (do_stat(hashname, &st) == -1) {
+		char *dirname;
+
+		/* Directory does not exist. */
+		dirname = strdup(hashname);
+		*strrchr(dirname,'/') = 0;
+		if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
+			rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
+			free(hashname);
+			free(dirname);
+			return robust_rename(fnametmp,fname,0644);
+		}
+		free(dirname);
+
+		if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
+			rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
+			free(hashname);
+			return robust_rename(fnametmp,fname,0644);
+		}
+
+		first = 1;
+		asprintf(&linkname,"%s/0",hashname);
+		rprintf(FINFO, "(1) linkname = %s\n", linkname);
+	} else {
+		struct hashfile_struct *hashfiles, *hashfile;
+
+		if (do_stat(fnametmp,&st) == -1) {
+			rsyserr(FERROR, errno, "stat failed: %s", fname);
+			return -1;
+		}
+		hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
+
+		if (hashfiles == NULL) {
+			first = 1;
+			asprintf(&linkname,"%s/0",hashname);
+			rprintf(FINFO, "(2) linkname = %s\n", linkname);
+		} else {
+			int fd;
+			/* Search for one identical to us. */
+			if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
+				rsyserr(FERROR, errno, "open failed: %s", fnametmp);
+				kill_hashfiles(hashfiles);
+				return -1;
+			}
+			hashfile = compare_hashfiles(fd, hashfiles);
+			hashfiles = NULL;
+			close(fd);
+
+			if (hashfile) {
+				first = 0;
+				linkname = strdup(hashfile->name);
+				rprintf(FINFO, "(3) linkname = %s\n", linkname);
+				kill_hashfile(hashfile);
+			} else {
+				first = 1;
+				asprintf(&linkname, "%s/%ld", hashname,
+					 last_fnbr + 1);
+				rprintf(FINFO, "(4) linkname = %s\n", linkname);
+			}
+		}
+	}
+
+	if (!first) {
+		rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n",
+				linkname, full_fname(fname));
+		robust_unlink(fname);
+		rc = do_link(linkname, fname);
+		if (rc == -1) {
+			if (errno == EMLINK) {
+				first = 1;
+				free(linkname);
+				asprintf(&linkname,"%s/%ld",hashname,
+					 last_fnbr + 1);
+				rprintf(FINFO, "(5) linkname = %s\n", linkname);
+				rprintf(FINFO,"link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
+			} else {
+				rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
+					linkname, full_fname(fname));
+				rc = robust_rename(fnametmp,fname,0644);
+			}
+		} else {
+			do_unlink(fnametmp);
+		}
+	}
+
+	if (first) {
+		rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n",
+				full_fname(fname),linkname);
+
+		rc = robust_rename(fnametmp,fname,0644);
+		if (rc != 0) {
+			rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
+				full_fname(fnametmp), full_fname(fname));
+		}
+		rc = do_link(fname,linkname);
+		if (rc != 0) {
+			rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
+				full_fname(fname), linkname);
+		}
+	}
+
+	free(linkname);
+	free(hashname);
+	return rc;
+}
+
+#endif
--- orig/options.c	2005-07-28 18:48:38
+++ options.c	2005-05-19 08:55:42
@@ -141,6 +141,7 @@ char *log_format = NULL;
 char *password_file = NULL;
 char *rsync_path = RSYNC_PATH;
 char *backup_dir = NULL;
+char *link_by_hash_dir = NULL;
 char backup_dir_buf[MAXPATHLEN];
 int rsync_port = 0;
 int compare_dest = 0;
@@ -321,6 +322,7 @@ void usage(enum logcode F)
   rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
   rprintf(F,"     --copy-dest=DIR         ... and include copies of unchanged files\n");
   rprintf(F,"     --link-dest=DIR         hardlink to files in DIR when unchanged\n");
+  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash into DIR\n");
   rprintf(F," -z, --compress              compress file data during the transfer\n");
   rprintf(F," -C, --cvs-exclude           auto-ignore files the same way CVS does\n");
   rprintf(F," -f, --filter=RULE           add a file-filtering RULE\n");
@@ -362,7 +364,7 @@ void usage(enum logcode F)
 
 enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
       OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST,
-      OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW,
+      OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_LINK_BY_HASH,
       OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
       OPT_REFUSED_BASE = 9000};
 
@@ -432,6 +434,7 @@ static struct poptOption long_options[] 
   {"compare-dest",     0,  POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
   {"copy-dest",        0,  POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
   {"link-dest",        0,  POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
+  {"link-by-hash",     0,  POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
   {"fuzzy",           'y', POPT_ARG_NONE,   &fuzzy_basis, 0, 0, 0 },
   /* TODO: Should this take an optional int giving the compression level? */
   {"compress",        'z', POPT_ARG_NONE,   &do_compression, 0, 0, 0 },
@@ -876,6 +879,21 @@ int parse_arguments(int *argc, const cha
 			basis_dir[basis_dir_cnt++] = (char *)arg;
 			break;
 
+                case OPT_LINK_BY_HASH:
+#if HAVE_LINK
+			arg = poptGetOptArg(pc);
+			if (sanitize_paths)
+				arg = sanitize_path(NULL, arg, NULL, 0);
+			link_by_hash_dir = (char *)arg;
+			break;
+#else
+			snprintf(err_buf, sizeof err_buf,
+				 "hard links are not supported on this %s\n",
+				 am_server ? "server" : "client");
+			rprintf(FERROR, "ERROR: %s", err_buf);
+			return 0;
+#endif
+
 		default:
 			/* A large opt value means that set_refuse_options()
 			 * turned this option off. */
@@ -1458,6 +1476,11 @@ void server_options(char **args,int *arg
 		}
 	}
 
+	if (link_by_hash_dir && am_sender) {
+		args[ac++] = "--link-by-hash";
+		args[ac++] = link_by_hash_dir;
+	}
+
 	if (files_from && (!am_sender || filesfrom_host)) {
 		if (filesfrom_host) {
 			args[ac++] = "--files-from";
--- orig/receiver.c	2005-07-28 18:48:38
+++ receiver.c	2005-01-15 21:29:13
@@ -52,6 +52,7 @@ extern int delay_updates;
 extern struct stats stats;
 extern char *log_format;
 extern char *tmpdir;
+extern char *link_by_hash_dir;
 extern char *partial_dir;
 extern char *basis_dir[];
 extern struct file_list *the_file_list;
@@ -185,12 +186,13 @@ static int get_tmpname(char *fnametmp, c
 
 
 static int receive_data(int f_in, char *fname_r, int fd_r, OFF_T size_r,
-			char *fname, int fd, OFF_T total_size)
+			char *fname, int fd, OFF_T total_size, char *md4)
 {
 	static char file_sum1[MD4_SUM_LENGTH];
 	static char file_sum2[MD4_SUM_LENGTH];
 	struct map_struct *mapbuf;
 	struct sum_struct sum;
+	struct mdfour mdfour_data;
 	int32 len;
 	OFF_T offset = 0;
 	OFF_T offset2;
@@ -210,6 +212,9 @@ static int receive_data(int f_in, char *
 	} else
 		mapbuf = NULL;
 
+	if (md4)
+		mdfour_begin(&mdfour_data);
+
 	sum_init(checksum_seed);
 
 	while ((i = recv_token(f_in, &data)) != 0) {
@@ -226,6 +231,8 @@ static int receive_data(int f_in, char *
 			cleanup_got_literal = 1;
 
 			sum_update(data, i);
+			if (md4)
+				mdfour_update(&mdfour_data,data,i);
 
 			if (fd != -1 && write_file(fd,data,i) != i)
 				goto report_write_error;
@@ -252,6 +259,8 @@ static int receive_data(int f_in, char *
 
 			see_token(map, len);
 			sum_update(map, len);
+			if (md4)
+				mdfour_update(&mdfour_data,map,len);
 		}
 
 		if (inplace) {
@@ -292,6 +301,8 @@ static int receive_data(int f_in, char *
 	}
 
 	sum_end(file_sum1);
+	if (md4)
+		mdfour_result(&mdfour_data, (unsigned char*)md4);
 
 	if (mapbuf)
 		unmap_file(mapbuf);
@@ -307,7 +318,7 @@ static int receive_data(int f_in, char *
 
 static void discard_receive_data(int f_in, OFF_T length)
 {
-	receive_data(f_in, NULL, -1, 0, NULL, -1, length);
+	receive_data(f_in, NULL, -1, 0, NULL, -1, length, NULL);
 }
 
 static void handle_delayed_updates(struct file_list *flist, char *local_name)
@@ -635,8 +646,12 @@ int recv_files(int f_in, struct file_lis
 			rprintf(FINFO, "%s\n", safe_fname(fname));
 
 		/* recv file data */
+#if HAVE_LINK
+		if (link_by_hash_dir)
+			file->u.sum = new_array(char, MD4_SUM_LENGTH);
+#endif
 		recv_ok = receive_data(f_in, fnamecmp, fd1, st.st_size,
-				       fname, fd2, file->length);
+				       fname, fd2, file->length, file->u.sum);
 
 		if (!log_before_transfer)
 			log_item(file, &initial_stats, iflags, NULL);
--- orig/rsync.c	2005-07-28 18:48:38
+++ rsync.c	2005-02-21 11:04:36
@@ -38,6 +38,7 @@ extern int inplace;
 extern int keep_dirlinks;
 extern int make_backups;
 extern struct stats stats;
+extern char *link_by_hash_dir;
 
 
 /*
@@ -188,7 +189,12 @@ void finish_transfer(char *fname, char *
 		rprintf(FINFO, "renaming %s to %s\n",
 			safe_fname(fnametmp), safe_fname(fname));
 	}
-	ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
+#if HAVE_LINK
+	if (link_by_hash_dir)
+		ret = link_by_hash(fnametmp, fname, file);
+	else
+#endif
+		ret = robust_rename(fnametmp, fname, file->mode & INITACCESSPERMS);
 	if (ret < 0) {
 		rsyserr(FERROR, errno, "%s %s -> \"%s\"",
 		    ret == -2 ? "copy" : "rename",
--- orig/rsync.h	2005-05-03 17:00:47
+++ rsync.h	2004-07-03 20:20:15
@@ -631,6 +631,14 @@ struct stats {
 	int current_file_index;
 };
 
+struct hashfile_struct {
+	struct hashfile_struct *next;
+	struct hashfile_struct *prev;
+	char *name;
+	int fd;
+	uint32 nlink;
+};
+
 
 #include "byteorder.h"
 #include "lib/mdfour.h"
--- orig/rsync.yo	2005-07-28 19:26:48
+++ rsync.yo	2005-02-13 06:58:47
@@ -354,6 +354,7 @@ to the detailed description below for a 
      --compare-dest=DIR      also compare received files relative to DIR
      --copy-dest=DIR         ... and include copies of unchanged files
      --link-dest=DIR         hardlink to files in DIR when unchanged
+     --link-by-hash=DIR      create hardlinks by hash into DIR
  -z, --compress              compress file data during the transfer
  -C, --cvs-exclude           auto-ignore files in the same way CVS does
  -f, --filter=RULE           add a file-filtering RULE
