| // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "chrome/browser/history/text_database_manager.h" |
| |
| #include "base/compiler_specific.h" |
| #include "base/file_util.h" |
| #include "base/metrics/histogram.h" |
| #include "base/logging.h" |
| #include "base/message_loop.h" |
| #include "base/string_util.h" |
| #include "base/utf_string_conversions.h" |
| #include "chrome/browser/history/history_publisher.h" |
| #include "chrome/browser/history/visit_database.h" |
| #include "chrome/common/mru_cache.h" |
| |
| using base::Time; |
| using base::TimeDelta; |
| using base::TimeTicks; |
| |
| namespace history { |
| |
| namespace { |
| |
| // The number of database files we will be attached to at once. |
| const int kCacheDBSize = 5; |
| |
| std::string ConvertStringForIndexer(const string16& input) { |
| // TODO(evanm): other transformations here? |
| return UTF16ToUTF8(CollapseWhitespace(input, false)); |
| } |
| |
| // Data older than this will be committed to the full text index even if we |
| // haven't gotten a title and/or body. |
| const int kExpirationSec = 20; |
| |
| } // namespace |
| |
| // TextDatabaseManager::ChangeSet ---------------------------------------------- |
| |
| TextDatabaseManager::ChangeSet::ChangeSet() {} |
| |
| TextDatabaseManager::ChangeSet::~ChangeSet() {} |
| |
| // TextDatabaseManager::PageInfo ----------------------------------------------- |
| |
| TextDatabaseManager::PageInfo::PageInfo(URLID url_id, |
| VisitID visit_id, |
| Time visit_time) |
| : url_id_(url_id), |
| visit_id_(visit_id), |
| visit_time_(visit_time) { |
| added_time_ = TimeTicks::Now(); |
| } |
| |
| TextDatabaseManager::PageInfo::~PageInfo() {} |
| |
| void TextDatabaseManager::PageInfo::set_title(const string16& ttl) { |
| if (ttl.empty()) // Make the title nonempty when we set it for EverybodySet. |
| title_ = ASCIIToUTF16(" "); |
| else |
| title_ = ttl; |
| } |
| |
| void TextDatabaseManager::PageInfo::set_body(const string16& bdy) { |
| if (bdy.empty()) // Make the body nonempty when we set it for EverybodySet. |
| body_ = ASCIIToUTF16(" "); |
| else |
| body_ = bdy; |
| } |
| |
| bool TextDatabaseManager::PageInfo::Expired(TimeTicks now) const { |
| return now - added_time_ > TimeDelta::FromSeconds(kExpirationSec); |
| } |
| |
| // TextDatabaseManager --------------------------------------------------------- |
| |
| TextDatabaseManager::TextDatabaseManager(const FilePath& dir, |
| URLDatabase* url_database, |
| VisitDatabase* visit_database) |
| : dir_(dir), |
| url_database_(url_database), |
| visit_database_(visit_database), |
| recent_changes_(RecentChangeList::NO_AUTO_EVICT), |
| transaction_nesting_(0), |
| db_cache_(DBCache::NO_AUTO_EVICT), |
| present_databases_loaded_(false), |
| ALLOW_THIS_IN_INITIALIZER_LIST(factory_(this)), |
| history_publisher_(NULL) { |
| } |
| |
| TextDatabaseManager::~TextDatabaseManager() { |
| if (transaction_nesting_) |
| CommitTransaction(); |
| } |
| |
| // static |
| TextDatabase::DBIdent TextDatabaseManager::TimeToID(Time time) { |
| Time::Exploded exploded; |
| time.UTCExplode(&exploded); |
| |
| // We combine the month and year into a 6-digit number (200801 for |
| // January, 2008). The month is 1-based. |
| return exploded.year * 100 + exploded.month; |
| } |
| |
| // static |
| Time TextDatabaseManager::IDToTime(TextDatabase::DBIdent id) { |
| Time::Exploded exploded; |
| memset(&exploded, 0, sizeof(Time::Exploded)); |
| exploded.year = id / 100; |
| exploded.month = id % 100; |
| return Time::FromUTCExploded(exploded); |
| } |
| |
| bool TextDatabaseManager::Init(const HistoryPublisher* history_publisher) { |
| history_publisher_ = history_publisher; |
| |
| // Start checking recent changes and committing them. |
| ScheduleFlushOldChanges(); |
| return true; |
| } |
| |
| void TextDatabaseManager::BeginTransaction() { |
| transaction_nesting_++; |
| } |
| |
| void TextDatabaseManager::CommitTransaction() { |
| DCHECK(transaction_nesting_); |
| transaction_nesting_--; |
| if (transaction_nesting_) |
| return; // Still more nesting of transactions before committing. |
| |
| // Commit all databases with open transactions on them. |
| for (DBIdentSet::const_iterator i = open_transactions_.begin(); |
| i != open_transactions_.end(); ++i) { |
| DBCache::iterator iter = db_cache_.Get(*i); |
| if (iter == db_cache_.end()) { |
| NOTREACHED() << "All open transactions should be cached."; |
| continue; |
| } |
| iter->second->CommitTransaction(); |
| } |
| open_transactions_.clear(); |
| |
| // Now that the transaction is over, we can expire old connections. |
| db_cache_.ShrinkToSize(kCacheDBSize); |
| } |
| |
| void TextDatabaseManager::InitDBList() { |
| if (present_databases_loaded_) |
| return; |
| |
| present_databases_loaded_ = true; |
| |
| // Find files on disk matching our pattern so we can quickly test for them. |
| FilePath::StringType filepattern(TextDatabase::file_base()); |
| filepattern.append(FILE_PATH_LITERAL("*")); |
| file_util::FileEnumerator enumerator( |
| dir_, false, file_util::FileEnumerator::FILES, filepattern); |
| FilePath cur_file; |
| while (!(cur_file = enumerator.Next()).empty()) { |
| // Convert to the number representing this file. |
| TextDatabase::DBIdent id = TextDatabase::FileNameToID(cur_file); |
| if (id) // Will be 0 on error. |
| present_databases_.insert(id); |
| } |
| } |
| |
| void TextDatabaseManager::AddPageURL(const GURL& url, |
| URLID url_id, |
| VisitID visit_id, |
| Time time) { |
| // Delete any existing page info. |
| RecentChangeList::iterator found = recent_changes_.Peek(url); |
| if (found != recent_changes_.end()) |
| recent_changes_.Erase(found); |
| |
| // Just save this info for later. We will save it when it expires or when all |
| // the data is complete. |
| recent_changes_.Put(url, PageInfo(url_id, visit_id, time)); |
| } |
| |
| void TextDatabaseManager::AddPageTitle(const GURL& url, |
| const string16& title) { |
| RecentChangeList::iterator found = recent_changes_.Peek(url); |
| if (found == recent_changes_.end()) { |
| // This page is not in our cache of recent pages. This is very much an edge |
| // case as normally a title will come in <20 seconds after the page commits, |
| // and TabContents will avoid spamming us with >1 title per page. However, |
| // it could come up if your connection is unhappy, and we don't want to |
| // miss anything. |
| // |
| // To solve this problem, we'll just associate the most recent visit with |
| // the new title and index that using the regular code path. |
| URLRow url_row; |
| if (!url_database_->GetRowForURL(url, &url_row)) |
| return; // URL is unknown, give up. |
| VisitRow visit; |
| if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit)) |
| return; // No recent visit, give up. |
| |
| if (visit.is_indexed) { |
| // If this page was already indexed, we could have a body that came in |
| // first and we don't want to overwrite it. We could go query for the |
| // current body, or have a special setter for only the title, but this is |
| // not worth it for this edge case. |
| // |
| // It will be almost impossible for the title to take longer than |
| // kExpirationSec yet we got a body in less than that time, since the |
| // title should always come in first. |
| return; |
| } |
| |
| AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time, |
| title, string16()); |
| return; // We don't know about this page, give up. |
| } |
| |
| PageInfo& info = found->second; |
| if (info.has_body()) { |
| // This info is complete, write to the database. |
| AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(), |
| title, info.body()); |
| recent_changes_.Erase(found); |
| return; |
| } |
| |
| info.set_title(title); |
| } |
| |
| void TextDatabaseManager::AddPageContents(const GURL& url, |
| const string16& body) { |
| RecentChangeList::iterator found = recent_changes_.Peek(url); |
| if (found == recent_changes_.end()) { |
| // This page is not in our cache of recent pages. This means that the page |
| // took more than kExpirationSec to load. Often, this will be the result of |
| // a very slow iframe or other resource on the page that makes us think its |
| // still loading. |
| // |
| // As a fallback, set the most recent visit's contents using the input, and |
| // use the last set title in the URL table as the title to index. |
| URLRow url_row; |
| if (!url_database_->GetRowForURL(url, &url_row)) |
| return; // URL is unknown, give up. |
| VisitRow visit; |
| if (!visit_database_->GetMostRecentVisitForURL(url_row.id(), &visit)) |
| return; // No recent visit, give up. |
| |
| // Use the title from the URL row as the title for the indexing. |
| AddPageData(url, url_row.id(), visit.visit_id, visit.visit_time, |
| url_row.title(), body); |
| return; |
| } |
| |
| PageInfo& info = found->second; |
| if (info.has_title()) { |
| // This info is complete, write to the database. |
| AddPageData(url, info.url_id(), info.visit_id(), info.visit_time(), |
| info.title(), body); |
| recent_changes_.Erase(found); |
| return; |
| } |
| |
| info.set_body(body); |
| } |
| |
| bool TextDatabaseManager::AddPageData(const GURL& url, |
| URLID url_id, |
| VisitID visit_id, |
| Time visit_time, |
| const string16& title, |
| const string16& body) { |
| TextDatabase* db = GetDBForTime(visit_time, true); |
| if (!db) |
| return false; |
| |
| TimeTicks beginning_time = TimeTicks::Now(); |
| |
| // First delete any recently-indexed data for this page. This will delete |
| // anything in the main database, but we don't bother looking through the |
| // archived database. |
| VisitVector visits; |
| visit_database_->GetVisitsForURL(url_id, &visits); |
| size_t our_visit_row_index = visits.size(); |
| for (size_t i = 0; i < visits.size(); i++) { |
| // While we're going trough all the visits, also find our row so we can |
| // avoid another DB query. |
| if (visits[i].visit_id == visit_id) { |
| our_visit_row_index = i; |
| } else if (visits[i].is_indexed) { |
| visits[i].is_indexed = false; |
| visit_database_->UpdateVisitRow(visits[i]); |
| DeletePageData(visits[i].visit_time, url, NULL); |
| } |
| } |
| |
| if (visit_id) { |
| // We're supposed to update the visit database. |
| if (our_visit_row_index >= visits.size()) { |
| NOTREACHED() << "We should always have found a visit when given an ID."; |
| return false; |
| } |
| |
| DCHECK(visit_time == visits[our_visit_row_index].visit_time); |
| |
| // Update the visit database to reference our addition. |
| visits[our_visit_row_index].is_indexed = true; |
| if (!visit_database_->UpdateVisitRow(visits[our_visit_row_index])) |
| return false; |
| } |
| |
| // Now index the data. |
| std::string url_str = URLDatabase::GURLToDatabaseURL(url); |
| bool success = db->AddPageData(visit_time, url_str, |
| ConvertStringForIndexer(title), |
| ConvertStringForIndexer(body)); |
| |
| UMA_HISTOGRAM_TIMES("History.AddFTSData", |
| TimeTicks::Now() - beginning_time); |
| |
| if (history_publisher_) |
| history_publisher_->PublishPageContent(visit_time, url, title, body); |
| |
| return success; |
| } |
| |
| void TextDatabaseManager::DeletePageData(Time time, const GURL& url, |
| ChangeSet* change_set) { |
| TextDatabase::DBIdent db_ident = TimeToID(time); |
| |
| // We want to open the database for writing, but only if it exists. To |
| // achieve this, we check whether it exists by saying we're not going to |
| // write to it (avoiding the autocreation code normally called when writing) |
| // and then access it for writing only if it succeeds. |
| TextDatabase* db = GetDB(db_ident, false); |
| if (!db) |
| return; |
| db = GetDB(db_ident, true); |
| |
| if (change_set) |
| change_set->Add(db_ident); |
| |
| db->DeletePageData(time, URLDatabase::GURLToDatabaseURL(url)); |
| } |
| |
| void TextDatabaseManager::DeleteFromUncommitted( |
| const std::set<GURL>& restrict_urls, Time begin, Time end) { |
| // First find the beginning of the range to delete. Recall that the list |
| // has the most recent item at the beginning. There won't normally be very |
| // many items, so a brute-force search is fine. |
| RecentChangeList::iterator cur = recent_changes_.begin(); |
| if (!end.is_null()) { |
| // Walk from the beginning of the list backwards in time to find the newest |
| // entry that should be deleted. |
| while (cur != recent_changes_.end() && cur->second.visit_time() >= end) |
| ++cur; |
| } |
| |
| // Now delete all visits up to the oldest one we were supposed to delete. |
| // Note that if begin is_null, it will be less than or equal to any other |
| // time. |
| if (restrict_urls.empty()) { |
| while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) |
| cur = recent_changes_.Erase(cur); |
| } else { |
| while (cur != recent_changes_.end() && cur->second.visit_time() >= begin) { |
| if (restrict_urls.find(cur->first) != restrict_urls.end()) |
| cur = recent_changes_.Erase(cur); |
| else |
| ++cur; |
| } |
| } |
| } |
| |
| void TextDatabaseManager::DeleteAll() { |
| DCHECK_EQ(0, transaction_nesting_) << "Calling deleteAll in a transaction."; |
| |
| InitDBList(); |
| |
| // Close all open databases. |
| db_cache_.Clear(); |
| |
| // Now go through and delete all the files. |
| for (DBIdentSet::iterator i = present_databases_.begin(); |
| i != present_databases_.end(); ++i) { |
| FilePath file_name = dir_.Append(TextDatabase::IDToFileName(*i)); |
| file_util::Delete(file_name, false); |
| } |
| } |
| |
| void TextDatabaseManager::OptimizeChangedDatabases( |
| const ChangeSet& change_set) { |
| for (ChangeSet::DBSet::const_iterator i = |
| change_set.changed_databases_.begin(); |
| i != change_set.changed_databases_.end(); ++i) { |
| // We want to open the database for writing, but only if it exists. To |
| // achieve this, we check whether it exists by saying we're not going to |
| // write to it (avoiding the autocreation code normally called when writing) |
| // and then access it for writing only if it succeeds. |
| TextDatabase* db = GetDB(*i, false); |
| if (!db) |
| continue; |
| db = GetDB(*i, true); |
| if (!db) |
| continue; // The file may have changed or something. |
| db->Optimize(); |
| } |
| } |
| |
| void TextDatabaseManager::GetTextMatches( |
| const string16& query, |
| const QueryOptions& options, |
| std::vector<TextDatabase::Match>* results, |
| Time* first_time_searched) { |
| results->clear(); |
| |
| InitDBList(); |
| if (present_databases_.empty()) { |
| // Nothing to search. |
| *first_time_searched = options.begin_time; |
| return; |
| } |
| |
| // Get the query into the proper format for the individual DBs. |
| string16 fts_query16; |
| query_parser_.ParseQuery(query, &fts_query16); |
| std::string fts_query = UTF16ToUTF8(fts_query16); |
| |
| // Need a copy of the options so we can modify the max count for each call |
| // to the individual databases. |
| QueryOptions cur_options(options); |
| |
| // Compute the minimum and maximum values for the identifiers that could |
| // encompass the input time range. |
| TextDatabase::DBIdent min_ident = options.begin_time.is_null() ? |
| *present_databases_.begin() : |
| TimeToID(options.begin_time); |
| TextDatabase::DBIdent max_ident = options.end_time.is_null() ? |
| *present_databases_.rbegin() : |
| TimeToID(options.end_time); |
| |
| // Iterate over the databases from the most recent backwards. |
| bool checked_one = false; |
| TextDatabase::URLSet found_urls; |
| for (DBIdentSet::reverse_iterator i = present_databases_.rbegin(); |
| i != present_databases_.rend(); |
| ++i) { |
| // TODO(brettw) allow canceling the query in the middle. |
| // if (canceled_or_something) |
| // break; |
| |
| // This code is stupid, we just loop until we find the correct starting |
| // time range rather than search in an intelligent way. Users will have a |
| // few dozen files at most, so this should not be an issue. |
| if (*i > max_ident) |
| continue; // Haven't gotten to the time range yet. |
| if (*i < min_ident) |
| break; // Covered all the time range. |
| |
| TextDatabase* cur_db = GetDB(*i, false); |
| if (!cur_db) |
| continue; |
| |
| // Adjust the max count according to how many results we've already got. |
| if (options.max_count) { |
| cur_options.max_count = options.max_count - |
| static_cast<int>(results->size()); |
| } |
| |
| // Since we are going backwards in time, it is always OK to pass the |
| // current first_time_searched, since it will always be smaller than |
| // any previous set. |
| cur_db->GetTextMatches(fts_query, cur_options, |
| results, &found_urls, first_time_searched); |
| checked_one = true; |
| |
| DCHECK(options.max_count == 0 || |
| static_cast<int>(results->size()) <= options.max_count); |
| if (options.max_count && |
| static_cast<int>(results->size()) >= options.max_count) |
| break; // Got the max number of results. |
| } |
| |
| // When there were no databases in the range, we need to fix up the min time. |
| if (!checked_one) |
| *first_time_searched = options.begin_time; |
| } |
| |
| TextDatabase* TextDatabaseManager::GetDB(TextDatabase::DBIdent id, |
| bool for_writing) { |
| DBCache::iterator found_db = db_cache_.Get(id); |
| if (found_db != db_cache_.end()) { |
| if (transaction_nesting_ && for_writing && |
| open_transactions_.find(id) == open_transactions_.end()) { |
| // If we currently have an open transaction, that database is not yet |
| // part of the transaction, and the database will be written to, it needs |
| // to be part of our transaction. |
| found_db->second->BeginTransaction(); |
| open_transactions_.insert(id); |
| } |
| return found_db->second; |
| } |
| |
| // Need to make the database. |
| TextDatabase* new_db = new TextDatabase(dir_, id, for_writing); |
| if (!new_db->Init()) { |
| delete new_db; |
| return NULL; |
| } |
| db_cache_.Put(id, new_db); |
| present_databases_.insert(id); |
| |
| if (transaction_nesting_ && for_writing) { |
| // If we currently have an open transaction and the new database will be |
| // written to, it needs to be part of our transaction. |
| new_db->BeginTransaction(); |
| open_transactions_.insert(id); |
| } |
| |
| // When no transaction is open, allow this new one to kick out an old one. |
| if (!transaction_nesting_) |
| db_cache_.ShrinkToSize(kCacheDBSize); |
| |
| return new_db; |
| } |
| |
| TextDatabase* TextDatabaseManager::GetDBForTime(Time time, |
| bool create_if_necessary) { |
| return GetDB(TimeToID(time), create_if_necessary); |
| } |
| |
| void TextDatabaseManager::ScheduleFlushOldChanges() { |
| factory_.RevokeAll(); |
| MessageLoop::current()->PostDelayedTask(FROM_HERE, factory_.NewRunnableMethod( |
| &TextDatabaseManager::FlushOldChanges), |
| kExpirationSec * Time::kMillisecondsPerSecond); |
| } |
| |
| void TextDatabaseManager::FlushOldChanges() { |
| FlushOldChangesForTime(TimeTicks::Now()); |
| } |
| |
| void TextDatabaseManager::FlushOldChangesForTime(TimeTicks now) { |
| // The end of the list is the oldest, so we just start from there committing |
| // things until we get something too new. |
| RecentChangeList::reverse_iterator i = recent_changes_.rbegin(); |
| while (i != recent_changes_.rend() && i->second.Expired(now)) { |
| AddPageData(i->first, i->second.url_id(), i->second.visit_id(), |
| i->second.visit_time(), i->second.title(), i->second.body()); |
| i = recent_changes_.Erase(i); |
| } |
| |
| ScheduleFlushOldChanges(); |
| } |
| |
| } // namespace history |