diff --git a/Makefile.am b/Makefile.am index 446eb39..2e30583 100644 --- a/Makefile.am +++ b/Makefile.am @@ -18,7 +18,8 @@ TESTS=testcases/largefilesupport.sh \ testcases/verify_deterministic_operation.sh \ testcases/checksum_options.sh \ testcases/md5collisions.sh \ - testcases/sha1collisions.sh + testcases/sha1collisions.sh \ + testcases/hardlink_groups.sh AUXFILES=testcases/common_funcs.sh \ testcases/md5collisions/letter_of_rec.ps \ diff --git a/README.md b/README.md index 646c203..dcc83a3 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Rdfind uses the following algorithm. If N is the number of files to search throu 2. For each argument, list the directory contents recursively and assign it to the file list. Assign a directory depth number, starting at 0 for every argument. 3. If the input argument is a file, add it to the file list. 4. Loop over the list, and find out the sizes of all files. -5. If flag -removeidentinode true: Remove items from the list which already are added, based on the combination of inode and device number. A group of files that are hardlinked to the same file are collapsed to one entry. Also see the comment on hardlinks under ”caveats below”! +5. If flag -removeidentinode true: Remove items from the list which already are added, based on the combination of inode and device number. A group of files that are hardlinked to the same file are collapsed to one entry. If flag -rememberidentinode true, the removed files are remembered and included in the final result. Also see the comment on hardlinks under ”caveats below”! 6. Sort files on size. Remove files from the list, which have unique sizes. 7. Sort on device and inode(speeds up file reading). Read a few bytes from the beginning of each file (first bytes). 8. Remove files from list that have the same size but different first bytes. 
diff --git a/Rdutil.cc b/Rdutil.cc index f098d2c..9480064 100644 --- a/Rdutil.cc +++ b/Rdutil.cc @@ -44,13 +44,15 @@ Rdutil::printtofile(const std::string& filename) const output << "# Automatically generated\n"; output << "# duptype id depth size device inode priority name\n"; - std::vector<Fileinfo>::iterator it; - for (it = m_list.begin(); it != m_list.end(); ++it) { - output << Fileinfo::getduptypestring(*it) << " " << it->getidentity() << " " - << it->depth() << " " << it->size() << " " << it->device() << " " - << it->inode() << " " << it->get_cmdline_index() << " " << it->name() - << '\n'; - } + process_result( + [&output](Fileinfo& it) { + output << Fileinfo::getduptypestring(it) << " " << it.getidentity() << " " + << it.depth() << " " << it.size() << " " << it.device() << " " + << it.inode() << " " << it.get_cmdline_index() << " " << it.name() + << '\n'; + } + ); + output << "# end of file\n"; f1.close(); return 0; @@ -61,44 +63,42 @@ Rdutil::printtofile(const std::string& filename) const // returns how many times the function was invoked. template<typename Function> std::size_t -applyactiononfile(std::vector<Fileinfo>& m_list, Function f) +Rdutil::applyactiononfile(Function f) const { - - const auto first = m_list.begin(); - const auto last = m_list.end(); - auto original = last; - + Fileinfo* original = NULL; std::size_t ntimesapplied = 0; - // loop over files - for (auto it = first; it != last; ++it) { - switch (it->getduptype()) { - case Fileinfo::duptype::DUPTYPE_FIRST_OCCURRENCE: { - original = it; - assert(original->getidentity() >= 0 && - "original file should have positive identity"); - } break; - - case Fileinfo::duptype::DUPTYPE_OUTSIDE_TREE: - // intentional fallthrough - case Fileinfo::duptype::DUPTYPE_WITHIN_SAME_TREE: { - assert(original != last); - // double check that "it" shall be ~linked to "src" - assert(it->getidentity() == -original->getidentity() && - "it must be connected to src"); - // everything is in order. we may now hardlink/symlink/remove it. 
- if (f(*it, *original)) { - RDDEBUG(__FILE__ ": Failed to apply function f on it.\n"); - } else { - ++ntimesapplied; - } - } break; + process_result( + [f, original, &ntimesapplied](Fileinfo& it) mutable { + switch (it.getduptype()) { + case Fileinfo::duptype::DUPTYPE_FIRST_OCCURRENCE: { + original = &it; + assert(original->getidentity() >= 0 && + "original file should have positive identity"); + } break; + + case Fileinfo::duptype::DUPTYPE_OUTSIDE_TREE: + // intentional fallthrough + case Fileinfo::duptype::DUPTYPE_WITHIN_SAME_TREE: { + assert(original != NULL); + // double check that "it" shall be ~linked to "src" + assert(it.getidentity() == -original->getidentity() && + "it must be connected to src"); + // everything is in order. we may now hardlink/symlink/remove it. + if (f(it, *original)) { + RDDEBUG(__FILE__ ": Failed to apply function f on it.\n"); + } else { + ++ntimesapplied; + } + } break; - default: - assert("file with bad duptype at this stage. Programming error!" != - nullptr); + default: + assert("file with bad duptype at this stage. Programming error!" 
!= + nullptr); + } } - } + ); + return ntimesapplied; } @@ -140,11 +140,11 @@ Rdutil::deleteduplicates(bool dryrun) const if (dryrun) { const bool outputBname = false; dryrun_helper<outputBname> obj("delete "); - auto ret = applyactiononfile(m_list, obj); + auto ret = applyactiononfile(obj); std::cout.flush(); return ret; } else { - return applyactiononfile(m_list, &Fileinfo::static_deletefile); + return applyactiononfile(&Fileinfo::static_deletefile); } } @@ -154,11 +154,11 @@ Rdutil::makesymlinks(bool dryrun) const if (dryrun) { const bool outputBname = true; dryrun_helper<outputBname> obj("symlink ", " to "); - auto ret = applyactiononfile(m_list, obj); + auto ret = applyactiononfile(obj); std::cout.flush(); return ret; } else { - return applyactiononfile(m_list, &Fileinfo::static_makesymlink); + return applyactiononfile(&Fileinfo::static_makesymlink); } } @@ -168,11 +168,11 @@ Rdutil::makehardlinks(bool dryrun) const if (dryrun) { const bool outputBname = true; dryrun_helper<outputBname> obj("hardlink ", " to "); - const auto ret = applyactiononfile(m_list, obj); + const auto ret = applyactiononfile(obj); std::cout.flush(); return ret; } else - return applyactiononfile(m_list, &Fileinfo::static_makehardlink); + return applyactiononfile(&Fileinfo::static_makehardlink); } // mark files with a unique number @@ -298,7 +298,7 @@ Rdutil::sort_on_depth_and_name(std::size_t index_of_first) } std::size_t -Rdutil::removeIdenticalInodes() +Rdutil::removeIdenticalInodes(bool rememberIdenticalInodes) { // sort list on device and inode. 
auto cmp = cmpDeviceInode; @@ -315,6 +315,11 @@ Rdutil::removeIdenticalInodes() best->setdeleteflag(false); std::for_each(best + 1, last, [](Fileinfo& f) { f.setdeleteflag(true); }); }); + + if (rememberIdenticalInodes) { + move_deletes_to_duplist(); + } + return cleanup(); } @@ -377,6 +382,7 @@ Rdutil::markduplicates() { const auto cmp = cmpSizeThenBuffer; assert(std::is_sorted(m_list.begin(), m_list.end(), cmp)); + assert(std::is_sorted(m_identlist.begin(), m_identlist.end(), cmpDeviceInode)); // loop over ranges of adjacent elements using Iterator = decltype(m_list.begin()); @@ -384,7 +390,7 @@ Rdutil::markduplicates() m_list.begin(), m_list.end(), cmp, - [](const Iterator first, const Iterator last) { + [this](const Iterator first, const Iterator last) { // size and buffer are equal in [first,last) - all are duplicates! assert(std::distance(first, last) >= 2); @@ -413,7 +419,63 @@ Rdutil::markduplicates() std::for_each(first + 1, last, marker); assert(first->getduptype() == Fileinfo::duptype::DUPTYPE_FIRST_OCCURRENCE); + + if (m_identlist.size() > 0) { + auto np = m_identlist.end(); + m_identindex.push_back(np); + for (auto it = first+1; it < last; it++) { + auto cmp = cmpDeviceInode; + auto bound = std::lower_bound(m_identlist.begin(), m_identlist.end(), *it, cmp); + if (bound != m_identlist.end() && !cmp(*it, *bound)) { + assert(cmp(*it, *bound) == cmp(*bound, *it)); + m_identindex.push_back(bound); + } else { + m_identindex.push_back(np); + } + auto range = find_identical_inodes(it); + std::for_each(range.first, range.second, marker); + } + } }); + assert(m_identlist.size() == 0 || m_identindex.size() == m_list.size()); +} + +std::pair<Rdutil::FileIter, Rdutil::FileIter> +Rdutil::find_identical_inodes(Rdutil::FileIter listpos) const +{ + assert(m_identindex.size() != 0); + auto index = listpos - m_list.begin(); + auto first = m_identindex[index]; + auto last = first; + for (; last < m_identlist.end() && !cmpDeviceInode(*first, *last); last++) {} + return std::pair<FileIter, FileIter>(first, last); +} + +template<typename Function> 
+void +Rdutil::process_result(Function f) const +{ + bool with_remembered_nodes = m_identlist.size() > 0; + for (auto it = m_list.begin(); it < m_list.end(); ++it) { + f(*it); + if (with_remembered_nodes) { + auto range = find_identical_inodes(it); + for (auto range_it = range.first; range_it < range.second; ++range_it) { + f(*range_it); + } + } + } +} + +void +Rdutil::move_deletes_to_duplist() +{ + for (auto it = m_list.begin(); it < m_list.end(); it++) { + if(it->deleteflag()) { + m_identlist.push_back(*it); + } + } + std::sort(m_identlist.begin(), m_identlist.end(), cmpDeviceInode); } std::size_t diff --git a/Rdutil.hh b/Rdutil.hh index b39e2e9..00ce53a 100644 --- a/Rdutil.hh +++ b/Rdutil.hh @@ -46,7 +46,7 @@ public: * rank. * @return number of elements removed */ - std::size_t removeIdenticalInodes(); + std::size_t removeIdenticalInodes(bool rememberIdenticalInodes); /** * remove files with unique size from the list. @@ -121,6 +121,20 @@ public: private: std::vector<Fileinfo>& m_list; + + std::vector<Fileinfo> m_identlist; + + typedef std::vector<Fileinfo>::iterator FileIter; + + std::vector<FileIter> m_identindex; + + void move_deletes_to_duplist(); + + std::pair<FileIter, FileIter> find_identical_inodes(const FileIter listpos) const; + + template<typename Function> std::size_t applyactiononfile(Function f) const; + + template<typename Function> void process_result(Function f) const; }; #endif diff --git a/rdfind.1 b/rdfind.1 index d390370..e4e1d4d 100644 --- a/rdfind.1 +++ b/rdfind.1 @@ -74,7 +74,12 @@ Follow symlinks. Default is false. .TP .BR \-removeidentinode " " \fItrue\fR|\fIfalse\fR Removes items found which have identical inode and device ID. Default -is true. +is true. Consider using -rememberidentinode true instead of -removeidentinode false. +.TP +.BR \-rememberidentinode " " \fItrue\fR|\fIfalse\fR +Removes but remembers items found which have identical inode and device ID and adds +them again in the final result. Runs faster and reports more accurate statistics +than with -removeidentinode false. Implies -removeidentinode true. 
Default is false. .TP .BR \-checksum " " \fImd5\fR|\fIsha1\fR|\fIsha256\fR What type of checksum to be used: md5, sha1 or sha256. The default is diff --git a/rdfind.cc b/rdfind.cc index facdda7..b9b630b 100644 --- a/rdfind.cc +++ b/rdfind.cc @@ -60,6 +60,9 @@ usage() << " -followsymlinks true |(false) follow symlinks\n" << " -removeidentinode (true)| false ignore files with nonunique " "device and inode\n" + << " -rememberidentinode true|(false) ignore files with nonunique device " + "and inode but remember them and include them in the result later. " + "Implies -removeidentinode true\n" << " -checksum md5 |(sha1)| sha256\n" << " checksum type\n" << " -deterministic (true)| false makes results independent of order\n" @@ -102,6 +105,7 @@ struct Options bool followsymlinks = false; // follow symlinks bool dryrun = false; // only dryrun, dont destroy anything bool remove_identical_inode = true; // remove files with identical inodes + bool remember_identical_inode = false; // remember files with identical inodes, implies remove_identical_inode bool usemd5 = false; // use md5 checksum to check for similarity bool usesha1 = false; // use sha1 checksum to check for similarity bool usesha256 = false; // use sha256 checksum to check for similarity @@ -164,6 +168,8 @@ parseOptions(Parser& parser) o.dryrun = parser.get_parsed_bool(); } else if (parser.try_parse_bool("-removeidentinode")) { o.remove_identical_inode = parser.get_parsed_bool(); + } else if (parser.try_parse_bool("-rememberidentinode")) { + o.remember_identical_inode = parser.get_parsed_bool(); } else if (parser.try_parse_bool("-deterministic")) { o.deterministic = parser.get_parsed_bool(); } else if (parser.try_parse_string("-checksum")) { @@ -334,9 +340,10 @@ main(int narg, const char* argv[]) // list. 
gswd.markitems(); - if (o.remove_identical_inode) { + if (o.remove_identical_inode || o.remember_identical_inode) { // remove files with identical devices and inodes from the list - std::cout << dryruntext << "Removed " << gswd.removeIdenticalInodes() + std::cout << dryruntext << "Removed " << (o.remember_identical_inode ? "(but remembered) " : "") + << gswd.removeIdenticalInodes(o.remember_identical_inode) << " files due to nonunique device and inode." << std::endl; } diff --git a/testcases/hardlink_groups.sh b/testcases/hardlink_groups.sh new file mode 100755 index 0000000..4930215 --- /dev/null +++ b/testcases/hardlink_groups.sh @@ -0,0 +1,93 @@ +#!/bin/sh +# Verify hardlink creation when the input already contains groups of hardlinked files. +# + + +set -e +. "$(dirname "$0")/common_funcs.sh" + +reset_teststate + +assert_eq() { + message="$1" + actual="$2" + expected="$3" + if [ "$expected" != "$actual" ]; then + dbgecho "ASSERTION FAILED [$message], expected '$expected' but was '$actual'" + exit 1 + fi +} + +make_files() { + head -c 1000000 </dev/urandom >a + ln a a1 + ln a a2 + cp a A + ln A A1 + ln A A2 + + head -c 1000000 </dev/urandom >b + ln b b1 + ln b b2 + cp b B + ln B B1 + ln B B2 +} + +verify_result() { + assert_eq "number of files" $(ls | wc -l) 12 + assert_eq "number of hardlinks to a" $(stat -c %h $datadir/a) 6 + assert_eq "number of hardlinks to b" $(stat -c %h $datadir/b) 6 + if cmp --silent $datadir/a $datadir/b; then + dbgecho "Files should be different" + exit 1 + fi +} + +# <expected_reported> <expected_reduction> <expected_links> [rdfind_option...] 
+verify_reduction_with_options() { + expected_reported="$1" + expected_reduction="$2" + expected_links="$3" + shift 3 + + size_before=$(du -m "$datadir" | cut -f1) + + $rdfind -makehardlinks true "$@" "$datadir" | tee rdfind.out + + assert_eq "reported reduction" "$(cat rdfind.out | grep Totally | sed 's/Totally, \([^ ]*\).*/\1/')" "$expected_reported" + assert_eq "number of created hardlinks" "$(cat rdfind.out | grep Making | sed 's/Making \([^ ]*\).*/\1/')" "$expected_links" + + rm rdfind.out results.txt + size_after=$(du -m "$datadir" | cut -f1) + assert_eq "actual reduction" $(( $size_before - $size_after )) "$expected_reduction" +} + +# Default behavior +# requires multiple runs and reports incorrect reduction +make_files +verify_reduction_with_options 2 0 2 -removeidentinode true +verify_reduction_with_options 2 0 2 -removeidentinode true +verify_reduction_with_options 2 2 2 -removeidentinode true +verify_result +verify_reduction_with_options 0 0 0 -removeidentinode true +verify_result + +# removeidentinode false +# requires single run but reports incorrect reduction and number of created links +# also does too much work and keeps repeating it all even when no reduction can be gained +reset_teststate +make_files +verify_reduction_with_options 10 2 10 -removeidentinode false +verify_result +verify_reduction_with_options 10 0 10 -removeidentinode false +verify_result + +# rememberidentinode true +# requires single run and reports correct statistics +reset_teststate +make_files +verify_reduction_with_options 2 2 6 -rememberidentinode true +verify_result +verify_reduction_with_options 0 0 0 -rememberidentinode true +verify_result