* New command `nix-store --optimise' to reduce Nix store disk space
usage by finding identical files in the store and hard-linking them to each other. It typically reduces the size of the store by roughly 25-35%. This is what the optimise-store.pl script did, but the new command is faster and more correct (it is safe with respect to garbage collection and concurrent builds).
This commit is contained in:
		
							parent
							
								
									27a0662828
								
							
						
					
					
						commit
						a8629de827
					
				
					 5 changed files with 188 additions and 99 deletions
				
			
		|  | @ -1,91 +0,0 @@ | ||||||
#! /usr/bin/perl -w

# Reduce the disk space used by the Nix store by replacing regular files
# that have identical contents (and the same "all execute bits set"
# status) with hard links to a single copy.
#
# The script runs in three phases:
#   1. hash every regular file under @paths with md5sum;
#   2. sort the resulting list so that equal hashes are adjacent;
#   3. walk the sorted list and hard-link each file to the previous
#      file with the same hash.
#
# Writable files are skipped.  File names containing newlines or
# backslashes are not supported (md5sum escapes them); such lines are
# reported and skipped.

use strict;
use warnings;
use File::Basename;
use File::Temp qw(tempdir);


my @paths = ("/nix/store");


# Keep the intermediate lists in a freshly created private directory
# rather than at a fixed, predictable name in a world-writable
# directory.  The directory is removed when the script exits.
my $tmpDir = tempdir("nix-optimise.XXXXXX", TMPDIR => 1, CLEANUP => 1);
my $hashList = "$tmpDir/hash-list";


print "hashing...\n";

system("find @paths -type f -print0 | xargs -0 md5sum -- > $hashList") == 0
    or die "cannot hash store files";


print "sorting by hash...\n";

# List form of system(): no shell is involved for this step.
system("sort", "-o", "$hashList.sorted", $hashList) == 0
    or die "cannot sort list";


# Atomically replace $new with a hard link to $target.
#
# The link is first created under a temporary name next to $new and
# then renamed over it.  The containing directory is made writable for
# the duration of the operation; its permissions and timestamps are
# restored afterwards, also when linking or renaming fails.
sub atomicLink {
    my ($target, $new) = @_;
    my $tmpNew = "${new}_optimise.$$";

    # Make the directory writable temporarily.
    my $dir = dirname $new;
    my @st = stat $dir or die "cannot stat `$dir': $!";

    chmod($st[2] | 0200, $dir) or die "cannot make `$dir' writable: $!";

    # Remember a failure instead of dying right away, so that the
    # directory metadata is restored first.
    my $error;
    if (!link $target, $tmpNew) {
        $error = "cannot create hard link `$tmpNew': $!";
    }
    elsif (!rename $tmpNew, $new) {
        $error = "cannot rename `$tmpNew' to `$new': $!";
        unlink $tmpNew;
    }

    chmod($st[2] & 07777, $dir) or die "cannot restore permission on `$dir': $!";
    utime($st[8], $st[9], $dir) or die "cannot restore timestamp on `$dir': $!";

    die $error if defined $error;
    return;
}


print "hard-linking...\n";

open my $list, "<", "$hashList.sorted"
    or die "cannot open `$hashList.sorted': $!";

# State of the first file seen with the current hash; later files with
# the same hash are linked to it.
my $prevFile;
my $prevHash;
my $prevInode;
my $prevExec;

my $totalSpace = 0;
my $savedSpace = 0;

while (my $line = <$list>) {
    chomp $line;

    # md5sum prints "<hash>  <name>" (text mode) or "<hash> *<name>"
    # (binary mode).  Match exactly one separator so that leading
    # whitespace in the file name is preserved.
    my ($curHash, $curFile) = $line =~ /^([0-9a-f]{32}) [ *](.*)$/;
    if (!defined $curHash) {
        warn "skipping unparsable line `$line'\n";
        next;
    }

    my @st = stat $curFile or die "cannot stat `$curFile': $!";
    next if ($st[2] & 0222) != 0; # skip writable files

    my $fileSize = $st[7];
    $totalSpace += $fileSize;
    my $isExec = (($st[2] & 0111) == 0111) ? 1 : 0;

    if (defined $prevHash && $curHash eq $prevHash
        && $prevExec == $isExec)
    {
        # Same contents and same executable status: link unless the
        # two names already refer to the same inode.
        if ($st[1] != $prevInode) {
            print "$curFile = $prevFile\n";
            atomicLink($prevFile, $curFile);
            $savedSpace += $fileSize;
        }
    }
    else {
        $prevFile = $curFile;
        $prevHash = $curHash;
        $prevInode = $st[1];
        $prevExec = $isExec;
    }
}

close $list;

print "total space = $totalSpace\n";
print "saved space = $savedSpace\n";
# Avoid a division by zero when no eligible files were found.
my $savings = $totalSpace > 0 ? ($savedSpace / $totalSpace) * 100.0 : 0;
print "savings = $savings %\n";
|  | @ -174,7 +174,7 @@ void copyPath(const Path & src, const Path & dst, PathFilter & filter) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| static void _canonicalisePathMetaData(const Path & path) | static void _canonicalisePathMetaData(const Path & path, bool recurse) | ||||||
| { | { | ||||||
|     checkInterrupt(); |     checkInterrupt(); | ||||||
| 
 | 
 | ||||||
|  | @ -223,17 +223,17 @@ static void _canonicalisePathMetaData(const Path & path) | ||||||
| 
 | 
 | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     if (S_ISDIR(st.st_mode)) { |     if (recurse && S_ISDIR(st.st_mode)) { | ||||||
|         Strings names = readDirectory(path); |         Strings names = readDirectory(path); | ||||||
| 	for (Strings::iterator i = names.begin(); i != names.end(); ++i) | 	for (Strings::iterator i = names.begin(); i != names.end(); ++i) | ||||||
| 	    _canonicalisePathMetaData(path + "/" + *i); | 	    _canonicalisePathMetaData(path + "/" + *i, true); | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| void canonicalisePathMetaData(const Path & path) | void canonicalisePathMetaData(const Path & path) | ||||||
| { | { | ||||||
|     _canonicalisePathMetaData(path); |     _canonicalisePathMetaData(path, true); | ||||||
| 
 | 
 | ||||||
|     /* On platforms that don't have lchown(), the top-level path can't
 |     /* On platforms that don't have lchown(), the top-level path can't
 | ||||||
|        be a symlink, since we can't change its ownership. */ |        be a symlink, since we can't change its ownership. */ | ||||||
|  | @ -625,7 +625,7 @@ void LocalStore::exportPath(const Path & path, bool sign, | ||||||
|        consistent metadata. */ |        consistent metadata. */ | ||||||
|     Transaction txn(nixDB); |     Transaction txn(nixDB); | ||||||
|     addTempRoot(path); |     addTempRoot(path); | ||||||
|     if (!isValidPath(path)) |     if (!isValidPathTxn(txn, path)) | ||||||
|         throw Error(format("path `%1%' is not valid") % path); |         throw Error(format("path `%1%' is not valid") % path); | ||||||
| 
 | 
 | ||||||
|     HashAndWriteSink hashAndWriteSink(sink); |     HashAndWriteSink hashAndWriteSink(sink); | ||||||
|  | @ -950,6 +950,121 @@ void verifyStore(bool checkContents) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | typedef std::map<Hash, std::pair<Path, ino_t> > HashToPath; | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | static void toggleWritable(const Path & path, bool writable) | ||||||
|  | { | ||||||
|  |     struct stat st; | ||||||
|  |     if (lstat(path.c_str(), &st)) | ||||||
|  | 	throw SysError(format("getting attributes of path `%1%'") % path); | ||||||
|  | 
 | ||||||
|  |     mode_t mode = st.st_mode; | ||||||
|  |     if (writable) mode |= S_IWUSR; | ||||||
|  |     else mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH); | ||||||
|  |      | ||||||
|  |     if (chmod(path.c_str(), mode) == -1) | ||||||
|  |         throw SysError(format("changing writability of `%1%'") % path); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | static void hashAndLink(bool dryRun, HashToPath & hashToPath, | ||||||
|  |     OptimiseStats & stats, const Path & path) | ||||||
|  | { | ||||||
|  |     struct stat st; | ||||||
|  |     if (lstat(path.c_str(), &st)) | ||||||
|  | 	throw SysError(format("getting attributes of path `%1%'") % path); | ||||||
|  | 
 | ||||||
|  |     /* Sometimes SNAFUs can cause files in the Nix store to be
 | ||||||
|  |        modified, in particular when running programs as root under | ||||||
|  |        NixOS (example: $fontconfig/var/cache being modified).  Skip | ||||||
|  |        those files. */ | ||||||
|  |     if (S_ISREG(st.st_mode) && (st.st_mode & S_IWUSR)) { | ||||||
|  |         printMsg(lvlError, format("skipping suspicious writable file `%1%'") % path); | ||||||
|  |         return; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     /* We can hard link regular files and symlinks. */ | ||||||
|  |     if (S_ISREG(st.st_mode) || S_ISLNK(st.st_mode)) { | ||||||
|  | 
 | ||||||
|  |         /* Hash the file.  Note that hashPath() returns the hash over
 | ||||||
|  |            the NAR serialisation, which includes the execute bit on | ||||||
|  |            the file.  Thus, executable and non-executable files with | ||||||
|  |            the same contents *won't* be linked (which is good because | ||||||
|  |            otherwise the permissions would be screwed up). | ||||||
|  | 
 | ||||||
|  |            Also note that if `path' is a symlink, then we're hashing | ||||||
|  |            the contents of the symlink (i.e. the result of | ||||||
|  |            readlink()), not the contents of the target (which may not | ||||||
|  |            even exist). */ | ||||||
|  |         Hash hash = hashPath(htSHA256, path); | ||||||
|  |         stats.totalFiles++; | ||||||
|  |         printMsg(lvlDebug, format("`%1%' has hash `%2%'") % path % printHash(hash)); | ||||||
|  | 
 | ||||||
|  |         std::pair<Path, ino_t> prevPath = hashToPath[hash]; | ||||||
|  |          | ||||||
|  |         if (prevPath.first == "") { | ||||||
|  |             hashToPath[hash] = std::pair<Path, ino_t>(path, st.st_ino); | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |              | ||||||
|  |         /* Yes!  We've seen a file with the same contents.  Replace
 | ||||||
|  |            the current file with a hard link to that file. */ | ||||||
|  |         stats.sameContents++; | ||||||
|  |         if (prevPath.second == st.st_ino) { | ||||||
|  |             printMsg(lvlDebug, format("`%1%' is already linked to `%2%'") % path % prevPath.first); | ||||||
|  |             return; | ||||||
|  |         } | ||||||
|  |          | ||||||
|  |         printMsg(lvlTalkative, format("linking `%1%' to `%2%'") % path % prevPath.first); | ||||||
|  | 
 | ||||||
|  |         Path tempLink = (format("%1%.tmp-%2%-%3%") | ||||||
|  |             % path % getpid() % rand()).str(); | ||||||
|  | 
 | ||||||
|  |         toggleWritable(dirOf(path), true); | ||||||
|  |          | ||||||
|  |         if (link(prevPath.first.c_str(), tempLink.c_str()) == -1) | ||||||
|  |             throw SysError(format("cannot link `%1%' to `%2%'") | ||||||
|  |                 % tempLink % prevPath.first); | ||||||
|  | 
 | ||||||
|  |         /* Atomically replace the old file with the new hard link. */ | ||||||
|  |         if (rename(tempLink.c_str(), path.c_str()) == -1) | ||||||
|  |             throw SysError(format("cannot rename `%1%' to `%2%'") | ||||||
|  |                 % tempLink % path); | ||||||
|  | 
 | ||||||
|  |         /* Make the directory read-only again and reset its timestamp
 | ||||||
|  |            back to 0. */ | ||||||
|  |         _canonicalisePathMetaData(dirOf(path), false); | ||||||
|  |          | ||||||
|  |         stats.filesLinked++; | ||||||
|  |         stats.bytesFreed += st.st_size; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (S_ISDIR(st.st_mode)) { | ||||||
|  |         Strings names = readDirectory(path); | ||||||
|  | 	for (Strings::iterator i = names.begin(); i != names.end(); ++i) | ||||||
|  | 	    hashAndLink(dryRun, hashToPath, stats, path + "/" + *i); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | void LocalStore::optimiseStore(bool dryRun, OptimiseStats & stats) | ||||||
|  | { | ||||||
|  |     HashToPath hashToPath; | ||||||
|  |      | ||||||
|  |     Paths paths; | ||||||
|  |     PathSet validPaths; | ||||||
|  |     nixDB.enumTable(noTxn, dbValidPaths, paths); | ||||||
|  | 
 | ||||||
|  |     for (Paths::iterator i = paths.begin(); i != paths.end(); ++i) { | ||||||
|  |         addTempRoot(*i); | ||||||
|  |         if (!isValidPath(*i)) continue; /* path was GC'ed, probably */ | ||||||
|  |         startNest(nest, lvlChatty, format("hashing files in `%1%'") % *i); | ||||||
|  |         hashAndLink(dryRun, hashToPath, stats, *i); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| /* Upgrade from schema 1 (Nix <= 0.7) to schema 2 (Nix >= 0.8). */ | /* Upgrade from schema 1 (Nix <= 0.7) to schema 2 (Nix >= 0.8). */ | ||||||
| static void upgradeStore07() | static void upgradeStore07() | ||||||
| { | { | ||||||
|  |  | ||||||
|  | @ -21,6 +21,20 @@ const int nixSchemaVersion = 4; | ||||||
| extern string drvsLogDir; | extern string drvsLogDir; | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
/* Counters describing the outcome of a store optimisation run.  All
   counters start at zero. */
struct OptimiseStats
{
    unsigned long totalFiles;      /* files that were hashed */
    unsigned long sameContents;    /* files whose hash had been seen before */
    unsigned long filesLinked;     /* files replaced by a hard link */
    unsigned long long bytesFreed; /* sum of the sizes of the replaced files */

    OptimiseStats()
        : totalFiles(0)
        , sameContents(0)
        , filesLinked(0)
        , bytesFreed(0)
    {
    }
};
|  | 
 | ||||||
|  | 
 | ||||||
| class LocalStore : public StoreAPI | class LocalStore : public StoreAPI | ||||||
| { | { | ||||||
| private: | private: | ||||||
|  | @ -83,6 +97,10 @@ public: | ||||||
| 
 | 
 | ||||||
|     void collectGarbage(GCAction action, const PathSet & pathsToDelete, |     void collectGarbage(GCAction action, const PathSet & pathsToDelete, | ||||||
|         bool ignoreLiveness, PathSet & result, unsigned long long & bytesFreed); |         bool ignoreLiveness, PathSet & result, unsigned long long & bytesFreed); | ||||||
|  | 
 | ||||||
|  |     /* Optimise the disk space usage of the Nix store by hard-linking
 | ||||||
|  |        files with the same contents. */ | ||||||
|  |     void optimiseStore(bool dryRun, OptimiseStats & stats); | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -21,6 +21,7 @@ Operations: | ||||||
| 
 | 
 | ||||||
|   --init: initialise the Nix database |   --init: initialise the Nix database | ||||||
|   --verify: verify Nix structures |   --verify: verify Nix structures | ||||||
|  |   --optimise: optimise the Nix store by hard-linking identical files | ||||||
| 
 | 
 | ||||||
|   --version: output version information |   --version: output version information | ||||||
|   --help: display help |   --help: display help | ||||||
|  |  | ||||||
|  | @ -466,6 +466,13 @@ static void opCheckValidity(Strings opFlags, Strings opArgs) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | static string showBytes(unsigned long long bytes) | ||||||
|  | { | ||||||
|  |     return (format("%d bytes (%.2f MiB)") | ||||||
|  |         % bytes % (bytes / (1024.0 * 1024.0))).str(); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| struct PrintFreed  | struct PrintFreed  | ||||||
| { | { | ||||||
|     bool show, dryRun; |     bool show, dryRun; | ||||||
|  | @ -477,9 +484,9 @@ struct PrintFreed | ||||||
|         if (show) |         if (show) | ||||||
|             cout << format( |             cout << format( | ||||||
|                 (dryRun |                 (dryRun | ||||||
|                     ? "%d bytes would be freed (%.2f MiB)\n" |                     ? "%1% would be freed\n" | ||||||
|                     : "%d bytes freed (%.2f MiB)\n")) |                     : "%1% freed (%.2f MiB)\n")) | ||||||
|                 % bytesFreed % (bytesFreed / (1024.0 * 1024.0)); |                 % showBytes(bytesFreed); | ||||||
|     } |     } | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | @ -614,6 +621,43 @@ static void opVerify(Strings opFlags, Strings opArgs) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | static void showOptimiseStats(OptimiseStats & stats) | ||||||
|  | { | ||||||
|  |     printMsg(lvlError, | ||||||
|  |         format("%1% freed by hard-linking %2% files; there are %3% files with equal contents out of %4% files in total") | ||||||
|  |         % showBytes(stats.bytesFreed) | ||||||
|  |         % stats.filesLinked | ||||||
|  |         % stats.sameContents | ||||||
|  |         % stats.totalFiles); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | /* Optimise the disk space usage of the Nix store by hard-linking
 | ||||||
|  |    files with the same contents. */ | ||||||
|  | static void opOptimise(Strings opFlags, Strings opArgs) | ||||||
|  | { | ||||||
|  |     if (!opArgs.empty()) | ||||||
|  |         throw UsageError("no arguments expected"); | ||||||
|  | 
 | ||||||
|  |     for (Strings::iterator i = opFlags.begin(); | ||||||
|  |          i != opFlags.end(); ++i) | ||||||
|  |         throw UsageError(format("unknown flag `%1%'") % *i); | ||||||
|  | 
 | ||||||
|  |     LocalStore * store2(dynamic_cast<LocalStore *>(store.get())); | ||||||
|  |     if (!store2) throw Error("you don't have sufficient rights to use --optimise"); | ||||||
|  | 
 | ||||||
|  |     OptimiseStats stats; | ||||||
|  |     try { | ||||||
|  |         store2->optimiseStore(true, stats); | ||||||
|  |     } catch (...) { | ||||||
|  |         showOptimiseStats(stats); | ||||||
|  |         throw; | ||||||
|  |     } | ||||||
|  |     showOptimiseStats(stats); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| /* Scan the arguments; find the operation, set global flags, put all
 | /* Scan the arguments; find the operation, set global flags, put all
 | ||||||
|    other flags in a list, and put all other arguments in another |    other flags in a list, and put all other arguments in another | ||||||
|    list. */ |    list. */ | ||||||
|  | @ -659,6 +703,8 @@ void run(Strings args) | ||||||
|             op = opInit; |             op = opInit; | ||||||
|         else if (arg == "--verify") |         else if (arg == "--verify") | ||||||
|             op = opVerify; |             op = opVerify; | ||||||
|  |         else if (arg == "--optimise") | ||||||
|  |             op = opOptimise; | ||||||
|         else if (arg == "--add-root") { |         else if (arg == "--add-root") { | ||||||
|             if (i == args.end()) |             if (i == args.end()) | ||||||
|                 throw UsageError("`--add-root requires an argument"); |                 throw UsageError("`--add-root requires an argument"); | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue