// string_pool.cc - Don Yang (uguu.org) // // 06/10/11 #include"string_pool.h" #include #include #include"util.h" using namespace std; namespace { // Functor for comparing items by (frequency, string contents) struct CmpFrequency { inline bool operator()(const StringPool::Item *a, const StringPool::Item *b) const { if( a->frequency != b->frequency ) return a->frequency > b->frequency; return a->data < b->data; } }; // Check if a string has the requested prefix static inline bool HasPrefix(const string &s, const string &prefix) { if( s.size() < prefix.size() ) return false; return s.compare(0, prefix.size(), prefix) == 0; } } // namespace StringPool::StringPool() {} StringPool::~StringPool() { DeletePtrElements(&strings_); } // Add a string to pool StringPool::Item *StringPool::Add(const string &data) { Item key; key.data = data; PrefixSet::iterator next = prefixes_.lower_bound(&key); if( next != prefixes_.end() ) { // Found an existing item greater than or equal to input, check // if input is a prefix of the existing item. Item *item = *next; if( HasPrefix(item->data, data) ) { // Update existing item if( !data.empty() ) item->frequency++; return item; } } if( next != prefixes_.begin() ) { // Found an existing item less than input, check if existing // item is a prefix of input. PrefixSet::iterator previous = next; Item *item = *--previous; if( HasPrefix(data, item->data) ) { // Update existing item assert(!data.empty()); item->data.swap(key.data); item->frequency++; return item; } } // Now we have an iterator position where the input would fit, and // we have verified that the input can't be folded into the // existing items on both sides. Insert a new item. Item *new_item = new Item; new_item->data.swap(key.data); new_item->index = static_cast(strings_.size()); new_item->frequency = (new_item->data.empty() ? 0 : 1); strings_.push_back(new_item); prefixes_.insert(next, new_item); return new_item; } // Find an existing string const StringPool::Item *StringPool::Find(const string &data) const { Item key; key.data = data; PrefixSet::const_iterator i = prefixes_.lower_bound(&key); if( i == prefixes_.end() ) return NULL; Item *item = *i; return HasPrefix(item->data, data) ? item : NULL; } // Canonicalize a single string and update frequencies StringPool::Item *StringPool::Canonicalize(Item *text, int length) { Item key; assert(length <= static_cast(text->data.size())); key.data.assign(text->data, 0, length); PrefixSet::iterator i = prefixes_.lower_bound(&key); assert(i != prefixes_.end()); Item *item = *i; assert(length <= static_cast(item->data.size())); assert(item->data.compare(0, length, text->data.substr(0, length)) == 0); if( length > 0 ) { text->frequency--; item->frequency++; } return item; } // Sort items by frequency void StringPool::Sort() { sort(strings_.begin(), strings_.end(), CmpFrequency()); int index = 0; for(ItemList::iterator i = strings_.begin(); i != strings_.end(); ++i) (*i)->index = index++; }