// string_pool.cc - Don Yang (uguu.org) // // Library for sharing strings. // // Given a list of strings, we want to be able to convert each string // reference to an (index, prefix_length) pair, where: // - "index" is the index into some array, such that more frequently // occurring strings get lower indices. // - "prefix_length" is the length of the string to copy. If a string // A is a prefix of another string B, both strings should share the // same index, but they will have different prefix_length. // // The ultimate goal is to convert an array of input strings into two // arrays: // - An array of strings. All input strings would be a substring of // one of these strings. // - An array of (index, prefix_length) pairs that reconstructs the // original array of input strings, minimizing the total number of // characters needed to specify all index and prefix_length values // in decimal. // // 06/10/11 #ifndef STRING_POOL_H_ #define STRING_POOL_H_ #include #include #include using namespace std; class StringPool { public: // A single shared item struct Item { Item() : index(0), frequency(0) {} string data; int index; int frequency; }; // List of shared items typedef vector ItemList; StringPool(); ~StringPool(); // Add a single string to pool, returns the shared Item for this // string and update frequency. // // Note that frequency for empty string is never incremented. This // is because empty strings always encode to (index=0, length=0) // pairs, so we don't want empty strings to affect the encoding // efficiency for other strings. Item *Add(const string &data); // Find an existing shared string. If no suitable string is found, // NULL is returned. Frequencies are not updated. const Item *Find(const string &data) const; // Given a string that's already in the pool, canonicalize this // string such that it references the earliest string that shares // the same prefix, and adjust the frequencies accordingly. This // should be used after all strings have been observed, and before // calling Sort(). // // This is needed because the frequency counts are sensitive to the // order which strings are added. For example, given these strings // hogepiyo, hoge, hogefuga, hoge // The resulting frequencies will be: // hogepiyo=2, hogefuga=2 // But the optimal frequencies should have been: // hogefuga=3, hogepiyo=1 // // Calling this function on each "hoge" string will canonicalize // them so that they are counted correctly. Item *Canonicalize(Item *text, int length); // Sort all currently shared Items by frequency. References to all // previously returned Items will remain valid, but index values // inside those Items will change. void Sort(); // Get all shared strings inline const ItemList &strings() const { return strings_; } // Syntactic sugar inline const Item &item(int index) const { return *(strings_[index]); } private: // Functor for comparing strings by prefixes struct CmpPrefix { inline bool operator()(const Item *a, const Item *b) const { return a->data < b->data; } }; typedef set PrefixSet; ItemList strings_; PrefixSet prefixes_; Item empty_; }; #endif // STRING_POOL_H_