// Find long repeated substrings in a file, including overlapping strings. // // Usage: // ./repeated_substrings {input.txt} // // Outputs lines containing "string length count". #include #include #include #include #include #include namespace { // Minimum length to be accepted as a repeated string. static constexpr int kMinimumStringLength = 4; // (string prefix, (list of offsets where string is found)) using StringMap = std::map>; // Compare two string views from back to front, used for sorting strings // by suffix. struct ReverseCompare { bool operator()(const std::string_view &a, const std::string_view &b) const { std::string_view::const_reverse_iterator ia = a.rbegin(); std::string_view::const_reverse_iterator ib = b.rbegin(); while( ia != a.rend() && ib != b.rend() ) { if( *ia < *ib ) return true; if( *ia > *ib ) return false; ++ia; ++ib; } // List longer strings first. if( ib == b.rend() ) return ia != a.rend(); return false; } }; // (substring, frequency) using RepeatedStrings = std::map; // Load data a string. static std::string LoadFile(const char *filename) { std::array buffer; std::string data; FILE *f = fopen(filename, "rb"); if( f == nullptr ) { printf("Error loading %s\n", filename); return data; } while( !feof(f) ) { const size_t size = fread(buffer.data(), 1, buffer.size(), f); data.append(buffer.data(), size); if( size < buffer.size() ) break; } fclose(f); return data; } // Get offsets of all strings at minimum length. static StringMap IndexStringPrefixes(std::string_view data) { StringMap string_map; for(int i = 0; i <= static_cast(data.size()) - kMinimumStringLength; i++) { auto p = string_map.insert(StringMap::value_type( data.substr(i, kMinimumStringLength), {})); p.first->second.push_back(i); } return string_map; } // Remove substrings that appeared only once. static void RemoveUniqueStrings(StringMap *string_map) { for(StringMap::iterator i = string_map->begin(); i != string_map->end();) { StringMap::iterator d = i; ++i; if( d->second.size() <= 1 ) string_map->erase(d); } } // Escape unprintable characters in string. static std::string Escape(std::string_view s) { std::string r; for(char c : s) { switch( c ) { case '\n': r.append("\\n"); break; case '\\': r.append("\\\\"); break; case '\"': r.append("\\\""); break; default: if( c < 0x20 || c >= 0x7f ) { std::array buffer; sprintf(buffer.data(), "\\x%02x", c & 0xff); r.append(buffer.data()); } else { r.push_back(c); } break; } } return r; } } // namespace int main(int argc, char **argv) { if( argc != 2 ) return printf("%s {input.txt}\n", *argv); // Load and index input. const std::string data = LoadFile(argv[1]); StringMap string_map = IndexStringPrefixes(data); RemoveUniqueStrings(&string_map); RepeatedStrings repeated_strings; while( !string_map.empty() ) { StringMap extended_string_map; for(const auto &[key, offsets] : string_map) { // Try to extend key by one character. StringMap::iterator last_inserted = extended_string_map.end(); for(int i : offsets) { if( i + key.size() >= data.size() ) continue; auto p = extended_string_map.insert(StringMap::value_type( std::string_view(data).substr(i, key.size() + 1), {})); last_inserted = p.first; last_inserted->second.push_back(i); } // If we managed to extend all substrings in the set by the // same character, the last inserted key should have the same // number of offsets as what we started with. // // If any of the substrings had a different suffix, or if we // weren't able to extend the substring due to end of input, // we would be inserting multiple keys and split the offset // counts among those. When that happens, we know the // previous key was the longest substring with that prefix. if( last_inserted->second.size() < offsets.size() ) repeated_strings.insert(std::make_pair(key, offsets.size())); } RemoveUniqueStrings(&extended_string_map); string_map.swap(extended_string_map); } // Find longest string length. int width1 = 0; int width2 = 1; int width2_p10 = 10; for(const auto &[key, count] : repeated_strings) { const std::string s(Escape(key)); if( width1 < static_cast(s.size()) ) width1 = static_cast(s.size()); while( width2_p10 <= static_cast(key.size()) ) { width2++; width2_p10 *= 10; } } // Output strings with count. std::string_view previous_key; int previous_count = 0; for(const auto &[key, count] : repeated_strings) { // Ignore strings that are suffixes of something that was // already printed, unless we are able to get more matches with // this shorter suffix. if( count == previous_count && previous_key.ends_with(key) ) continue; const std::string s(Escape(key)); printf("\"%s\"", s.c_str()); printf("%s %-*d %d\n", std::string(width1 + 1 - s.size(), ' ').c_str(), width2, static_cast(key.size()), count); previous_key = key; previous_count = count; } return 0; }