* New command `nix-store --optimise` to reduce Nix store disk space
usage by finding identical files in the store and hard-linking them to each other. It typically reduces the size of the store by roughly 25-35%. This is what the optimise-store.pl script did, but the new command is faster and more correct (it is safe with respect to garbage collection and concurrent builds).
This commit is contained in:
		
							parent
							
								
									27a0662828
								
							
						
					
					
						commit
						a8629de827
					
				
					 5 changed files with 188 additions and 99 deletions
				
			
		|  | @ -1,91 +0,0 @@ | |||
#! /usr/bin/perl -w

# Reduce the disk space used by the Nix store by replacing regular,
# read-only files that have identical contents and identical permission
# bits with hard links to a single copy.
#
# Files are hashed in-process (File::Find + Digest::MD5) rather than by
# piping `find' into `md5sum' and re-parsing a list kept under a
# predictable name in /tmp: that avoided neither symlink attacks on the
# list file nor file names that md5sum escapes (backslashes, newlines).

use strict;
use Digest::MD5;
use File::Basename;
use File::Find;


my @paths = ("/nix/store");


# Return the hex MD5 digest of the contents of $file; dies if the file
# cannot be opened or read.
sub hashFile {
    my ($file) = @_;

    open my $fh, '<', $file or die "cannot open `$file': $!";
    binmode $fh;
    my $hash = Digest::MD5->new->addfile($fh)->hexdigest;
    close $fh or die "cannot close `$file': $!";

    return $hash;
}


# Atomically replace $new with a hard link to $target.  The directory
# containing $new is made writable for the duration of the operation;
# its permissions and timestamps are restored afterwards, also when
# creating or renaming the link fails.
sub atomicLink {
    my ($target, $new) = @_;
    my $tmpNew = "${new}_optimise.$$";

    # Make the directory writable temporarily.
    my $dir = dirname $new;
    my @st = stat($dir) or die "cannot stat `$dir': $!";

    # stat's mode field also carries the file type; chmod only wants
    # the permission bits.
    my $perms = $st[2] & 07777;

    chmod($perms | 0200, $dir) or die "cannot make `$dir' writable: $!";

    my $ok = eval {
        link($target, $tmpNew)
            or die "cannot create hard link `$tmpNew': $!";
        rename($tmpNew, $new)
            or die "cannot rename `$tmpNew' to `$new': $!";
        1;
    };
    my $err = $@;

    # Don't leave a stray temporary link behind on failure.
    unlink $tmpNew unless $ok;

    chmod($perms, $dir) or die "cannot restore permission on `$dir': $!";
    utime($st[8], $st[9], $dir) or die "cannot restore timestamp on `$dir': $!";

    die $err unless $ok;
    return;
}


print "hashing...\n";

for my $path (@paths) {
    -d $path or die "`$path' is not a directory";
}

# List of [hash, file name] pairs for every regular file found.
my @entries;

find({
    no_chdir => 1,
    wanted   => sub {
        my $file = $File::Find::name;
        # lstat rather than stat, so that symlinks to regular files
        # are not picked up (this matches `find -type f').
        lstat $file or die "cannot stat `$file': $!";
        return unless -f _;
        push @entries, [hashFile($file), $file];
    },
}, @paths);


print "sorting by hash...\n";

@entries = sort { $a->[0] cmp $b->[0] or $a->[1] cmp $b->[1] } @entries;


print "hard-linking...\n";

# Hash of the group of entries currently being processed, and for each
# permission mode seen in that group the first file with that mode, as
# [file name, inode].  Only files with equal permissions are linked,
# since a hard link shares the mode of its target.
my $groupHash;
my %representative;

my $totalSpace = 0;
my $savedSpace = 0;

for my $entry (@entries) {
    my ($curHash, $curFile) = @$entry;

    my @st = stat($curFile) or die "cannot stat `$curFile': $!";
    next if ($st[2] & 0222) != 0; # skip writable files

    my $fileSize = $st[7];
    $totalSpace += $fileSize;
    my $perms = $st[2] & 07777;

    if (!defined $groupHash || $curHash ne $groupHash) {
        $groupHash = $curHash;
        %representative = ();
    }

    my $rep = $representative{$perms};
    if (!defined $rep) {
        $representative{$perms} = [$curFile, $st[1]];
        next;
    }

    my ($prevFile, $prevInode) = @$rep;
    if ($st[1] != $prevInode) {
        print "$curFile = $prevFile\n";
        atomicLink($prevFile, $curFile);
        $savedSpace += $fileSize;
    }
}

print "total space = $totalSpace\n";
print "saved space = $savedSpace\n";
# Guard against an empty store (or one containing only writable files).
my $savings = $totalSpace > 0 ? ($savedSpace / $totalSpace) * 100.0 : 0;
print "savings = $savings %\n";
|  | @ -174,7 +174,7 @@ void copyPath(const Path & src, const Path & dst, PathFilter & filter) | |||
| } | ||||
| 
 | ||||
| 
 | ||||
| static void _canonicalisePathMetaData(const Path & path) | ||||
| static void _canonicalisePathMetaData(const Path & path, bool recurse) | ||||
| { | ||||
|     checkInterrupt(); | ||||
| 
 | ||||
|  | @ -223,17 +223,17 @@ static void _canonicalisePathMetaData(const Path & path) | |||
| 
 | ||||
|     } | ||||
| 
 | ||||
|     if (S_ISDIR(st.st_mode)) { | ||||
|     if (recurse && S_ISDIR(st.st_mode)) { | ||||
|         Strings names = readDirectory(path); | ||||
| 	for (Strings::iterator i = names.begin(); i != names.end(); ++i) | ||||
| 	    _canonicalisePathMetaData(path + "/" + *i); | ||||
| 	    _canonicalisePathMetaData(path + "/" + *i, true); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| void canonicalisePathMetaData(const Path & path) | ||||
| { | ||||
|     _canonicalisePathMetaData(path); | ||||
|     _canonicalisePathMetaData(path, true); | ||||
| 
 | ||||
|     /* On platforms that don't have lchown(), the top-level path can't
 | ||||
|        be a symlink, since we can't change its ownership. */ | ||||
|  | @ -625,7 +625,7 @@ void LocalStore::exportPath(const Path & path, bool sign, | |||
|        consistent metadata. */ | ||||
|     Transaction txn(nixDB); | ||||
|     addTempRoot(path); | ||||
|     if (!isValidPath(path)) | ||||
|     if (!isValidPathTxn(txn, path)) | ||||
|         throw Error(format("path `%1%' is not valid") % path); | ||||
| 
 | ||||
|     HashAndWriteSink hashAndWriteSink(sink); | ||||
|  | @ -950,6 +950,121 @@ void verifyStore(bool checkContents) | |||
| } | ||||
| 
 | ||||
| 
 | ||||
| typedef std::map<Hash, std::pair<Path, ino_t> > HashToPath; | ||||
| 
 | ||||
| 
 | ||||
| static void toggleWritable(const Path & path, bool writable) | ||||
| { | ||||
|     struct stat st; | ||||
|     if (lstat(path.c_str(), &st)) | ||||
| 	throw SysError(format("getting attributes of path `%1%'") % path); | ||||
| 
 | ||||
|     mode_t mode = st.st_mode; | ||||
|     if (writable) mode |= S_IWUSR; | ||||
|     else mode &= ~(S_IWUSR | S_IWGRP | S_IWOTH); | ||||
|      | ||||
|     if (chmod(path.c_str(), mode) == -1) | ||||
|         throw SysError(format("changing writability of `%1%'") % path); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| static void hashAndLink(bool dryRun, HashToPath & hashToPath, | ||||
|     OptimiseStats & stats, const Path & path) | ||||
| { | ||||
|     struct stat st; | ||||
|     if (lstat(path.c_str(), &st)) | ||||
| 	throw SysError(format("getting attributes of path `%1%'") % path); | ||||
| 
 | ||||
|     /* Sometimes SNAFUs can cause files in the Nix store to be
 | ||||
|        modified, in particular when running programs as root under | ||||
|        NixOS (example: $fontconfig/var/cache being modified).  Skip | ||||
|        those files. */ | ||||
|     if (S_ISREG(st.st_mode) && (st.st_mode & S_IWUSR)) { | ||||
|         printMsg(lvlError, format("skipping suspicious writable file `%1%'") % path); | ||||
|         return; | ||||
|     } | ||||
| 
 | ||||
|     /* We can hard link regular files and symlinks. */ | ||||
|     if (S_ISREG(st.st_mode) || S_ISLNK(st.st_mode)) { | ||||
| 
 | ||||
|         /* Hash the file.  Note that hashPath() returns the hash over
 | ||||
|            the NAR serialisation, which includes the execute bit on | ||||
|            the file.  Thus, executable and non-executable files with | ||||
|            the same contents *won't* be linked (which is good because | ||||
|            otherwise the permissions would be screwed up). | ||||
| 
 | ||||
|            Also note that if `path' is a symlink, then we're hashing | ||||
|            the contents of the symlink (i.e. the result of | ||||
|            readlink()), not the contents of the target (which may not | ||||
|            even exist). */ | ||||
|         Hash hash = hashPath(htSHA256, path); | ||||
|         stats.totalFiles++; | ||||
|         printMsg(lvlDebug, format("`%1%' has hash `%2%'") % path % printHash(hash)); | ||||
| 
 | ||||
|         std::pair<Path, ino_t> prevPath = hashToPath[hash]; | ||||
|          | ||||
|         if (prevPath.first == "") { | ||||
|             hashToPath[hash] = std::pair<Path, ino_t>(path, st.st_ino); | ||||
|             return; | ||||
|         } | ||||
|              | ||||
|         /* Yes!  We've seen a file with the same contents.  Replace
 | ||||
|            the current file with a hard link to that file. */ | ||||
|         stats.sameContents++; | ||||
|         if (prevPath.second == st.st_ino) { | ||||
|             printMsg(lvlDebug, format("`%1%' is already linked to `%2%'") % path % prevPath.first); | ||||
|             return; | ||||
|         } | ||||
|          | ||||
|         printMsg(lvlTalkative, format("linking `%1%' to `%2%'") % path % prevPath.first); | ||||
| 
 | ||||
|         Path tempLink = (format("%1%.tmp-%2%-%3%") | ||||
|             % path % getpid() % rand()).str(); | ||||
| 
 | ||||
|         toggleWritable(dirOf(path), true); | ||||
|          | ||||
|         if (link(prevPath.first.c_str(), tempLink.c_str()) == -1) | ||||
|             throw SysError(format("cannot link `%1%' to `%2%'") | ||||
|                 % tempLink % prevPath.first); | ||||
| 
 | ||||
|         /* Atomically replace the old file with the new hard link. */ | ||||
|         if (rename(tempLink.c_str(), path.c_str()) == -1) | ||||
|             throw SysError(format("cannot rename `%1%' to `%2%'") | ||||
|                 % tempLink % path); | ||||
| 
 | ||||
|         /* Make the directory read-only again and reset its timestamp
 | ||||
|            back to 0. */ | ||||
|         _canonicalisePathMetaData(dirOf(path), false); | ||||
|          | ||||
|         stats.filesLinked++; | ||||
|         stats.bytesFreed += st.st_size; | ||||
|     } | ||||
| 
 | ||||
|     if (S_ISDIR(st.st_mode)) { | ||||
|         Strings names = readDirectory(path); | ||||
| 	for (Strings::iterator i = names.begin(); i != names.end(); ++i) | ||||
| 	    hashAndLink(dryRun, hashToPath, stats, path + "/" + *i); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| void LocalStore::optimiseStore(bool dryRun, OptimiseStats & stats) | ||||
| { | ||||
|     HashToPath hashToPath; | ||||
|      | ||||
|     Paths paths; | ||||
|     PathSet validPaths; | ||||
|     nixDB.enumTable(noTxn, dbValidPaths, paths); | ||||
| 
 | ||||
|     for (Paths::iterator i = paths.begin(); i != paths.end(); ++i) { | ||||
|         addTempRoot(*i); | ||||
|         if (!isValidPath(*i)) continue; /* path was GC'ed, probably */ | ||||
|         startNest(nest, lvlChatty, format("hashing files in `%1%'") % *i); | ||||
|         hashAndLink(dryRun, hashToPath, stats, *i); | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /* Upgrade from schema 1 (Nix <= 0.7) to schema 2 (Nix >= 0.8). */ | ||||
| static void upgradeStore07() | ||||
| { | ||||
|  |  | |||
|  | @ -21,6 +21,20 @@ const int nixSchemaVersion = 4; | |||
| extern string drvsLogDir; | ||||
| 
 | ||||
| 
 | ||||
/* Counters describing the outcome of a store optimisation run.  All
   counters start at zero. */
struct OptimiseStats
{
    unsigned long totalFiles;       /* files examined */
    unsigned long sameContents;     /* files whose contents were seen before */
    unsigned long filesLinked;      /* files replaced by a hard link */
    unsigned long long bytesFreed;  /* total size of the replaced files */

    OptimiseStats()
        : totalFiles(0)
        , sameContents(0)
        , filesLinked(0)
        , bytesFreed(0)
    {
    }
};
| 
 | ||||
| 
 | ||||
| class LocalStore : public StoreAPI | ||||
| { | ||||
| private: | ||||
|  | @ -83,6 +97,10 @@ public: | |||
| 
 | ||||
|     void collectGarbage(GCAction action, const PathSet & pathsToDelete, | ||||
|         bool ignoreLiveness, PathSet & result, unsigned long long & bytesFreed); | ||||
| 
 | ||||
|     /* Optimise the disk space usage of the Nix store by hard-linking
 | ||||
|        files with the same contents. */ | ||||
|     void optimiseStore(bool dryRun, OptimiseStats & stats); | ||||
| }; | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -21,6 +21,7 @@ Operations: | |||
| 
 | ||||
|   --init: initialise the Nix database | ||||
|   --verify: verify Nix structures | ||||
|   --optimise: optimise the Nix store by hard-linking identical files | ||||
| 
 | ||||
|   --version: output version information | ||||
|   --help: display help | ||||
|  |  | |||
|  | @ -466,6 +466,13 @@ static void opCheckValidity(Strings opFlags, Strings opArgs) | |||
| } | ||||
| 
 | ||||
| 
 | ||||
| static string showBytes(unsigned long long bytes) | ||||
| { | ||||
|     return (format("%d bytes (%.2f MiB)") | ||||
|         % bytes % (bytes / (1024.0 * 1024.0))).str(); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| struct PrintFreed  | ||||
| { | ||||
|     bool show, dryRun; | ||||
|  | @ -477,9 +484,9 @@ struct PrintFreed | |||
|         if (show) | ||||
|             cout << format( | ||||
|                 (dryRun | ||||
|                     ? "%d bytes would be freed (%.2f MiB)\n" | ||||
|                     : "%d bytes freed (%.2f MiB)\n")) | ||||
|                 % bytesFreed % (bytesFreed / (1024.0 * 1024.0)); | ||||
|                     ? "%1% would be freed\n" | ||||
|                     : "%1% freed (%.2f MiB)\n")) | ||||
|                 % showBytes(bytesFreed); | ||||
|     } | ||||
| }; | ||||
| 
 | ||||
|  | @ -614,6 +621,43 @@ static void opVerify(Strings opFlags, Strings opArgs) | |||
| } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| static void showOptimiseStats(OptimiseStats & stats) | ||||
| { | ||||
|     printMsg(lvlError, | ||||
|         format("%1% freed by hard-linking %2% files; there are %3% files with equal contents out of %4% files in total") | ||||
|         % showBytes(stats.bytesFreed) | ||||
|         % stats.filesLinked | ||||
|         % stats.sameContents | ||||
|         % stats.totalFiles); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /* Optimise the disk space usage of the Nix store by hard-linking
 | ||||
|    files with the same contents. */ | ||||
| static void opOptimise(Strings opFlags, Strings opArgs) | ||||
| { | ||||
|     if (!opArgs.empty()) | ||||
|         throw UsageError("no arguments expected"); | ||||
| 
 | ||||
|     for (Strings::iterator i = opFlags.begin(); | ||||
|          i != opFlags.end(); ++i) | ||||
|         throw UsageError(format("unknown flag `%1%'") % *i); | ||||
| 
 | ||||
|     LocalStore * store2(dynamic_cast<LocalStore *>(store.get())); | ||||
|     if (!store2) throw Error("you don't have sufficient rights to use --optimise"); | ||||
| 
 | ||||
|     OptimiseStats stats; | ||||
|     try { | ||||
|         store2->optimiseStore(true, stats); | ||||
|     } catch (...) { | ||||
|         showOptimiseStats(stats); | ||||
|         throw; | ||||
|     } | ||||
|     showOptimiseStats(stats); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| /* Scan the arguments; find the operation, set global flags, put all
 | ||||
|    other flags in a list, and put all other arguments in another | ||||
|    list. */ | ||||
|  | @ -659,6 +703,8 @@ void run(Strings args) | |||
|             op = opInit; | ||||
|         else if (arg == "--verify") | ||||
|             op = opVerify; | ||||
|         else if (arg == "--optimise") | ||||
|             op = opOptimise; | ||||
|         else if (arg == "--add-root") { | ||||
|             if (i == args.end()) | ||||
|                 throw UsageError("`--add-root requires an argument"); | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue