| // Copyright 2008 Google Inc. All Rights Reserved. |
| |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // error_diag.cc: Collects device errors for analysis to more accurately |
| // pin-point failed component. |
| |
#include <list>
#include <map>
#include <new>
#include <set>
| |
| // This file must work with autoconf on its public version, |
| // so these includes are correct. |
| #include "error_diag.h" |
| #include "sattypes.h" |
| |
| |
| // DeviceTree constructor. |
| DeviceTree::DeviceTree(string name) |
| : parent_(0), name_(name) { |
| pthread_mutex_init(&device_tree_mutex_, NULL); |
| } |
| |
| // DeviceTree destructor. |
| DeviceTree::~DeviceTree() { |
| // Deallocate subtree devices. |
| for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin(); |
| itr != subdevices_.end(); |
| ++itr) { |
| delete itr->second; |
| } |
| // Deallocate device errors. |
| for (std::list<ErrorInstance*>::iterator itr = errors_.begin(); |
| itr != errors_.end(); |
| ++itr) { |
| delete (*itr); |
| } |
| pthread_mutex_destroy(&device_tree_mutex_); |
| } |
| |
| // Atomically find named device in sub device tree. |
| // Returns 0 if not found |
| DeviceTree *DeviceTree::FindInSubTree(string name) { |
| DeviceTree *ret; |
| pthread_mutex_lock(&device_tree_mutex_); |
| ret = UnlockedFindInSubTree(name); |
| pthread_mutex_unlock(&device_tree_mutex_); |
| return ret; |
| } |
| |
| // Find named device in sub device tree (Non-atomic). |
| // Returns 0 if not found |
| DeviceTree *DeviceTree::UnlockedFindInSubTree(string name) { |
| std::map<string, DeviceTree*>::iterator itr = subdevices_.find(name); |
| if (itr != subdevices_.end()) { |
| return itr->second; |
| } else { |
| // Search sub-tree. |
| for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin(); |
| itr != subdevices_.end(); |
| ++itr) { |
| DeviceTree *result = itr->second->UnlockedFindInSubTree(name); |
| if (result != 0) |
| return result; |
| } |
| return 0; |
| } |
| } |
| |
| // Atomically add error instance to device. |
| void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) { |
| pthread_mutex_lock(&device_tree_mutex_); |
| errors_.push_back(error_instance); |
| pthread_mutex_unlock(&device_tree_mutex_); |
| } |
| |
| // Find or add queried device as necessary. |
| DeviceTree *DeviceTree::FindOrAddDevice(string name) { |
| // Assume named device does not exist and try to insert the device anyway. |
| // No-op if named device already exists. |
| InsertSubDevice(name); |
| // Find and return sub device pointer. |
| return FindInSubTree(name); |
| } |
| |
| // Pretty prints device tree. |
| void DeviceTree::PrettyPrint(string spacer) { |
| for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin(); |
| itr != subdevices_.end(); |
| ++itr) { |
| printf("%s%s\n", spacer.c_str(), itr->first.c_str()); |
| itr->second->PrettyPrint(spacer+spacer); |
| } |
| } |
| |
| // Atomically add sub device. |
| // No-op if named device already exists. |
| void DeviceTree::InsertSubDevice(string name) { |
| pthread_mutex_lock(&device_tree_mutex_); |
| if (UnlockedFindInSubTree(name) != 0) { |
| pthread_mutex_unlock(&device_tree_mutex_); |
| return; |
| } |
| subdevices_[name] = new DeviceTree(name); |
| subdevices_[name]->parent_ = this; |
| pthread_mutex_unlock(&device_tree_mutex_); |
| } |
| |
| |
| // Returns true of any error associated with this device is fatal. |
| bool DeviceTree::KnownBad() { |
| pthread_mutex_lock(&device_tree_mutex_); |
| for (std::list<ErrorInstance*>::iterator itr = errors_.begin(); |
| itr != errors_.end(); |
| ++itr) { |
| if ((*itr)->severity_ == SAT_ERROR_FATAL) { |
| pthread_mutex_unlock(&device_tree_mutex_); |
| return true; |
| } |
| } |
| pthread_mutex_unlock(&device_tree_mutex_); |
| return false; |
| } |
| |
| |
| // ErrorDiag constructor. |
| ErrorDiag::ErrorDiag() { |
| os_ = 0; |
| system_tree_root_ = 0; |
| } |
| |
| // ErrorDiag destructor. |
| ErrorDiag::~ErrorDiag() { |
| if (system_tree_root_) |
| delete system_tree_root_; |
| } |
| |
| // Set platform specific handle and initialize device tree. |
| // Returns false on error. true otherwise. |
| bool ErrorDiag::set_os(OsLayer *os) { |
| os_ = os; |
| return(InitializeDeviceTree()); |
| } |
| |
| // Create and initialize system device tree. |
| // Returns false on error. true otherwise. |
| bool ErrorDiag::InitializeDeviceTree() { |
| system_tree_root_ = new DeviceTree("system_root"); |
| if (!system_tree_root_) |
| return false; |
| return true; |
| } |
| |
| // Logs info about a CECC. |
| // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. |
| int ErrorDiag::AddCeccError(string dimm_string) { |
| DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string); |
| ECCErrorInstance *error = new ECCErrorInstance; |
| if (!error) |
| return -1; |
| error->severity_ = SAT_ERROR_CORRECTABLE; |
| dimm_device->AddErrorInstance(error); |
| return 0; |
| } |
| |
| // Logs info about a UECC. |
| // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. |
| int ErrorDiag::AddUeccError(string dimm_string) { |
| DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string); |
| ECCErrorInstance *error = new ECCErrorInstance; |
| if (!error) |
| return -1; |
| error->severity_ = SAT_ERROR_FATAL; |
| dimm_device->AddErrorInstance(error); |
| return 0; |
| } |
| |
| // Logs info about a miscompare. |
| // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. |
| int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) { |
| DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string); |
| MiscompareErrorInstance *error = new MiscompareErrorInstance; |
| if (!error) |
| return -1; |
| error->severity_ = SAT_ERROR_FATAL; |
| error->addr_ = addr; |
| dimm_device->AddErrorInstance(error); |
| os_->ErrorReport(dimm_string.c_str(), "miscompare", count); |
| return 1; |
| } |
| |
| // Utility Function to translate a virtual address to DIMM number. |
| // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. |
| string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) { |
| char dimm_string[256] = ""; |
| char *vbyteaddr = reinterpret_cast<char*>(addr) + offset; |
| uint64 paddr = os->VirtualToPhysical(vbyteaddr); |
| os->FindDimm(paddr, dimm_string, sizeof(dimm_string)); |
| return string(dimm_string); |
| } |
| |
| // Info about a miscompare from a drive. |
| // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. |
| int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset, |
| void *src_addr, void *dst_addr) { |
| bool mask_hdd_error = false; |
| |
| HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance; |
| if (!error) |
| return -1; |
| |
| error->addr_ = reinterpret_cast<uint64>(src_addr); |
| error->addr2_ = reinterpret_cast<uint64>(dst_addr); |
| error->offset_ = offset; |
| error->block_ = block; |
| |
| string src_dimm = AddressToDimmString(os_, src_addr, offset); |
| string dst_dimm = AddressToDimmString(os_, dst_addr, offset); |
| |
| // DIMM name look up success |
| if (src_dimm.compare("DIMM Unknown")) { |
| // Add src DIMM as possible miscompare cause. |
| DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm); |
| error->causes_.insert(src_dimm_dev); |
| if (src_dimm_dev->KnownBad()) { |
| mask_hdd_error = true; |
| logprintf(5, "Log: supressed %s miscompare report: " |
| "known bad source: %s\n", devicename.c_str(), src_dimm.c_str()); |
| } |
| } |
| if (dst_dimm.compare("DIMM Unknown")) { |
| // Add dst DIMM as possible miscompare cause. |
| DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm); |
| error->causes_.insert(dst_dimm_dev); |
| if (dst_dimm_dev->KnownBad()) { |
| mask_hdd_error = true; |
| logprintf(5, "Log: supressed %s miscompare report: " |
| "known bad destination: %s\n", devicename.c_str(), |
| dst_dimm.c_str()); |
| } |
| } |
| |
| DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename); |
| hdd_dev->AddErrorInstance(error); |
| |
| // HDD error was not masked by bad DIMMs: report bad HDD. |
| if (!mask_hdd_error) { |
| os_->ErrorReport(devicename.c_str(), "miscompare", 1); |
| error->severity_ = SAT_ERROR_FATAL; |
| return 1; |
| } |
| return 0; |
| } |
| |
| // Info about a sector tag miscompare from a drive. |
| // Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise. |
| int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset, |
| int sector, void *src_addr, |
| void *dst_addr) { |
| bool mask_hdd_error = false; |
| |
| HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance; |
| if (!error) |
| return -1; |
| |
| error->addr_ = reinterpret_cast<uint64>(src_addr); |
| error->addr2_ = reinterpret_cast<uint64>(dst_addr); |
| error->sector_ = sector; |
| error->block_ = block; |
| |
| string src_dimm = AddressToDimmString(os_, src_addr, offset); |
| string dst_dimm = AddressToDimmString(os_, dst_addr, offset); |
| |
| // DIMM name look up success |
| if (src_dimm.compare("DIMM Unknown")) { |
| // Add src DIMM as possible miscompare cause. |
| DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm); |
| error->causes_.insert(src_dimm_dev); |
| if (src_dimm_dev->KnownBad()) { |
| mask_hdd_error = true; |
| logprintf(5, "Log: supressed %s sector tag error report: " |
| "known bad source: %s\n", devicename.c_str(), src_dimm.c_str()); |
| } |
| } |
| if (dst_dimm.compare("DIMM Unknown")) { |
| // Add dst DIMM as possible miscompare cause. |
| DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm); |
| error->causes_.insert(dst_dimm_dev); |
| if (dst_dimm_dev->KnownBad()) { |
| mask_hdd_error = true; |
| logprintf(5, "Log: supressed %s sector tag error report: " |
| "known bad destination: %s\n", devicename.c_str(), |
| dst_dimm.c_str()); |
| } |
| } |
| |
| DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename); |
| hdd_dev->AddErrorInstance(error); |
| |
| // HDD error was not masked by bad DIMMs: report bad HDD. |
| if (!mask_hdd_error) { |
| os_->ErrorReport(devicename.c_str(), "sector", 1); |
| error->severity_ = SAT_ERROR_FATAL; |
| return 1; |
| } |
| return 0; |
| } |