| // Copyright 2006 Google Inc. All Rights Reserved. |
| |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // sat.cc : a stress test for stressful testing |
| |
| // stressapptest (or SAT, from Stressful Application Test) is a test |
| // designed to stress the system, as well as provide a comprehensive |
| // memory interface test. |
| |
| // stressapptest can be run using memory only, or using many system components. |
| |
| #include <errno.h> |
| #include <pthread.h> |
| #include <signal.h> |
| #include <stdarg.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <unistd.h> |
| |
| #include <sys/stat.h> |
| #include <sys/times.h> |
| |
| // #define __USE_GNU |
| // #define __USE_LARGEFILE64 |
| #include <fcntl.h> |
| |
| #include <list> |
| #include <string> |
| |
| // This file must work with autoconf on its public version, |
| // so these includes are correct. |
| #include "disk_blocks.h" |
| #include "logger.h" |
| #include "os.h" |
| #include "sat.h" |
| #include "sattypes.h" |
| #include "worker.h" |
| |
| // stressapptest versioning here. |
| #ifndef PACKAGE_VERSION |
| static const char* kVersion = "1.0.0"; |
| #else |
| static const char* kVersion = PACKAGE_VERSION; |
| #endif |
| |
| // Global stressapptest reference, for use by signal handler. |
| // Because of this, it is not safe to run multiple Sat instances at once. |
| namespace { |
| Sat *g_sat = NULL; |
| |
| // Signal handler for catching break or kill. |
| // |
| // This must be installed after g_sat is assigned and while there is a single |
| // thread. |
| // |
| // This must be uninstalled while there is only a single thread, and of course |
| // before g_sat is cleared or deleted. |
| void SatHandleBreak(int signal) { |
| g_sat->Break(); |
| } |
| } |
| |
| // Opens the logfile for writing if necessary |
| bool Sat::InitializeLogfile() { |
| // Open logfile. |
| if (use_logfile_) { |
| logfile_ = open(logfilename_, |
| O_WRONLY | O_CREAT | O_DSYNC, |
| S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); |
| if (logfile_ < 0) { |
| printf("Fatal Error: cannot open file %s for logging\n", |
| logfilename_); |
| bad_status(); |
| return false; |
| } |
| // We seek to the end once instead of opening in append mode because no |
| // other processes should be writing to it while this one exists. |
| if (lseek(logfile_, 0, SEEK_END) == -1) { |
| printf("Fatal Error: cannot seek to end of logfile (%s)\n", |
| logfilename_); |
| bad_status(); |
| return false; |
| } |
| Logger::GlobalLogger()->SetLogFd(logfile_); |
| } |
| return true; |
| } |
| |
| // Check that the environment is known and safe to run on. |
| // Returns true if the environment is good, false if unsupported. |
| bool Sat::CheckEnvironment() { |
| // Check that this is not a debug build. Debug builds lack |
| // enough performance to stress the system. |
| #if !defined NDEBUG |
| if (run_on_anything_) { |
| logprintf(1, "Log: Running DEBUG version of SAT, " |
| "with significantly reduced coverage.\n"); |
| } else { |
| logprintf(0, "Process Error: Running DEBUG version of SAT, " |
| "with significantly reduced coverage.\n"); |
| logprintf(0, "Log: Command line option '-A' bypasses this error.\n"); |
| bad_status(); |
| return false; |
| } |
| #elif !defined CHECKOPTS |
| #error Build system regression - COPTS disregarded. |
| #endif |
| |
| // Use all CPUs if nothing is specified. |
| if (memory_threads_ == -1) { |
| memory_threads_ = os_->num_cpus(); |
| logprintf(7, "Log: Defaulting to %d copy threads\n", memory_threads_); |
| } |
| |
| // Use all memory if no size is specified. |
| if (size_mb_ == 0) |
| size_mb_ = os_->FindFreeMemSize() / kMegabyte; |
| size_ = static_cast<int64>(size_mb_) * kMegabyte; |
| |
| // Autodetect file locations. |
| if (findfiles_ && (file_threads_ == 0)) { |
| // Get a list of disk locations suitable for file IO. |
| list<string> locations = os_->FindFileDevices(); |
| |
| // Extract each one. |
| while (!locations.empty()) { |
| // Copy and remove the disk name. |
| string disk = locations.back(); |
| locations.pop_back(); |
| |
| logprintf(12, "Log: disk at %s\n", disk.c_str()); |
| file_threads_++; |
| filename_.push_back(disk + "/sat_disk.a"); |
| file_threads_++; |
| filename_.push_back(disk + "/sat_disk.b"); |
| } |
| } |
| |
| // We'd better have some memory by this point. |
| if (size_ < 1) { |
| logprintf(0, "Process Error: No memory found to test.\n"); |
| bad_status(); |
| return false; |
| } |
| |
| if (tag_mode_ && ((file_threads_ > 0) || |
| (disk_threads_ > 0) || |
| (net_threads_ > 0))) { |
| logprintf(0, "Process Error: Memory tag mode incompatible " |
| "with disk/network DMA.\n"); |
| bad_status(); |
| return false; |
| } |
| |
| // If this is a 32 bit binary, floor the memory size to a multiple of 4MB. |
| if (address_mode_ == 32) { |
| size_mb_ = (size_mb_ / 4) * 4; |
| size_ = size_mb_ * kMegabyte; |
| logprintf(1, "Log: Flooring memory allocation to multiple of 4: %lldMB\n", |
| size_mb_); |
| } |
| |
| // Check if this system is on the whitelist for supported systems. |
| if (!os_->IsSupported()) { |
| if (run_on_anything_) { |
| logprintf(1, "Log: Unsupported system. Running with reduced coverage.\n"); |
| // This is ok, continue on. |
| } else { |
| logprintf(0, "Process Error: Unsupported system, " |
| "no error reporting available\n"); |
| logprintf(0, "Log: Command line option '-A' bypasses this error.\n"); |
| bad_status(); |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| // Allocates memory to run the test on |
| bool Sat::AllocateMemory() { |
| // Allocate our test memory. |
| bool result = os_->AllocateTestMem(size_, paddr_base_); |
| if (!result) { |
| logprintf(0, "Process Error: failed to allocate memory\n"); |
| bad_status(); |
| return false; |
| } |
| return true; |
| } |
| |
| // Sets up access to data patterns |
| bool Sat::InitializePatterns() { |
| // Initialize pattern data. |
| patternlist_ = new PatternList(); |
| if (!patternlist_) { |
| logprintf(0, "Process Error: failed to allocate patterns\n"); |
| bad_status(); |
| return false; |
| } |
| if (!patternlist_->Initialize()) { |
| logprintf(0, "Process Error: failed to initialize patternlist\n"); |
| bad_status(); |
| return false; |
| } |
| return true; |
| } |
| |
| // Get any valid page, no tag specified. |
| bool Sat::GetValid(struct page_entry *pe) { |
| return GetValid(pe, kDontCareTag); |
| } |
| |
| |
| // Get a valid page with the specified tag and map it for access. |
| bool Sat::GetValid(struct page_entry *pe, int32 tag) { |
| bool result = false; |
| // Get valid page depending on implementation. |
| if (pe_q_implementation_ == SAT_FINELOCK) |
| result = finelock_q_->GetValid(pe, tag); |
| else if (pe_q_implementation_ == SAT_ONELOCK) |
| result = valid_->PopRandom(pe); |
| |
| if (result) { |
| pe->addr = os_->PrepareTestMem(pe->offset, page_length_); // Map it. |
| |
| // Tag this access and current pattern. |
| pe->ts = os_->GetTimestamp(); |
| pe->lastpattern = pe->pattern; |
| |
| return (pe->addr != 0); // Return success or failure. |
| } |
| return false; |
| } |
| |
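| // Put a valid page back into the valid pool and unmap it. |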
| bool Sat::PutValid(struct page_entry *pe) { |
| if (pe->addr != 0) |
| os_->ReleaseTestMem(pe->addr, pe->offset, page_length_); // Unmap the page. |
| pe->addr = 0; |
| |
| // Put valid page depending on implementation. |
| if (pe_q_implementation_ == SAT_FINELOCK) |
| return finelock_q_->PutValid(pe); |
| else if (pe_q_implementation_ == SAT_ONELOCK) |
| return valid_->Push(pe); |
| else |
| return false; |
| } |
| |
| // Get an empty page with any tag. |
| bool Sat::GetEmpty(struct page_entry *pe) { |
| return GetEmpty(pe, kDontCareTag); |
| } |
| |
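| // Get an empty page with the specified tag and map it for access. |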
| bool Sat::GetEmpty(struct page_entry *pe, int32 tag) { |
| bool result = false; |
| // Get empty page depending on implementation. |
| if (pe_q_implementation_ == SAT_FINELOCK) |
| result = finelock_q_->GetEmpty(pe, tag); |
| else if (pe_q_implementation_ == SAT_ONELOCK) |
| result = empty_->PopRandom(pe); |
| |
| if (result) { |
| pe->addr = os_->PrepareTestMem(pe->offset, page_length_); // Map it. |
| return (pe->addr != 0); // Return success or failure. |
| } |
| return false; |
| } |
| |
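| // Put an empty page back into the empty pool and unmap it. |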
| bool Sat::PutEmpty(struct page_entry *pe) { |
| if (pe->addr != 0) |
| os_->ReleaseTestMem(pe->addr, pe->offset, page_length_); // Unmap the page. |
| pe->addr = 0; |
| |
| // Put empty page depending on implementation. |
| if (pe_q_implementation_ == SAT_FINELOCK) |
| return finelock_q_->PutEmpty(pe); |
| else if (pe_q_implementation_ == SAT_ONELOCK) |
| return empty_->Push(pe); |
| else |
| return false; |
| } |
| |
| // Set up the bitmap of physical pages in case we want to see which pages were |
| // accessed under this run of SAT. |
| void Sat::AddrMapInit() { |
| if (!do_page_map_) |
| return; |
| // Find roughly how much physical memory is in the system. |
| // TODO(nsanders): Find some way to get the max |
| // and min phys addr in the system. |
| uint64 maxsize = os_->FindFreeMemSize() * 4; |
| sat_assert(maxsize != 0); |
| |
| // Make a bitmask of this many pages. Assume that physical memory is |
| // roughly zero based, which is typically true on x86. |
| // This is one bit per page. |
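| // e.g. a 4GB address ceiling needs 4GB / 4096 / 8 = 128KB of bitmap. |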
| uint64 arraysize = maxsize / 4096 / 8; |
| unsigned char *bitmap = new unsigned char[arraysize]; |
| sat_assert(bitmap); |
| |
| // Mark every page as 0, not seen. |
| memset(bitmap, 0, arraysize); |
| |
| page_bitmap_size_ = maxsize; |
| page_bitmap_ = bitmap; |
| } |
| |
| // Add the 4k pages in this block to the array of pages SAT has seen. |
| void Sat::AddrMapUpdate(struct page_entry *pe) { |
| if (!do_page_map_) |
| return; |
| |
| // Go through 4k page blocks. |
| uint64 arraysize = page_bitmap_size_ / 4096 / 8; |
| |
| char *base = reinterpret_cast<char*>(pe->addr); |
| for (int i = 0; i < page_length_; i += 4096) { |
| uint64 paddr = os_->VirtualToPhysical(base + i); |
| |
| uint32 offset = paddr / 4096 / 8; |
| unsigned char mask = 1 << ((paddr / 4096) % 8); |
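| // 'offset' indexes the byte in the bitmap; 'mask' selects this page's bit. |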
| |
| if (offset >= arraysize) { |
| logprintf(0, "Process Error: Physical address %#llx is " |
| "greater than expected %#llx.\n", |
| paddr, page_bitmap_size_); |
| sat_assert(0); |
| } |
| page_bitmap_[offset] |= mask; |
| } |
| } |
| |
| // Print out the physical memory ranges that SAT has accessed. |
| void Sat::AddrMapPrint() { |
| if (!do_page_map_) |
| return; |
| |
| uint64 pages = page_bitmap_size_ / 4096; |
| |
| uint64 last_page = 0; |
| bool valid_range = false; |
| |
| logprintf(4, "Log: Printing tested physical ranges.\n"); |
| |
| for (uint64 i = 0; i < pages; i ++) { |
| int offset = i / 8; |
| unsigned char mask = 1 << (i % 8); |
| |
| bool touched = page_bitmap_[offset] & mask; |
| if (touched && !valid_range) { |
| valid_range = true; |
| last_page = i * 4096; |
| } else if (!touched && valid_range) { |
| valid_range = false; |
| logprintf(4, "Log: %#016llx - %#016llx\n", last_page, (i * 4096) - 1); |
| } |
| } |
| logprintf(4, "Log: Done printing physical ranges.\n"); |
| } |
| |
| // Initializes page lists and fills pages with data patterns. |
| bool Sat::InitializePages() { |
| int result = 1; |
| // Calculate needed page totals. |
| int64 neededpages = memory_threads_ + |
| invert_threads_ + |
| check_threads_ + |
| net_threads_ + |
| file_threads_; |
| |
| // The empty-valid page ratio is adjusted depending on the queue |
| // implementation. Since the fine-grain-locked queue keeps both valid and |
| // empty entries in the same queue and traverses it randomly to find pages, |
| // the empty-valid ratio should be more even. |
| if (pe_q_implementation_ == SAT_FINELOCK) |
| freepages_ = pages_ / 5 * 2; // Mark roughly 2/5 of all pages as Empty. |
| else |
| freepages_ = (pages_ / 100) + (2 * neededpages); |
| |
| if (freepages_ < neededpages) { |
| logprintf(0, "Process Error: freepages < neededpages.\n"); |
| logprintf(1, "Stats: Total: %lld, Needed: %lld, Marked free: %lld\n", |
| static_cast<int64>(pages_), |
| static_cast<int64>(neededpages), |
| static_cast<int64>(freepages_)); |
| bad_status(); |
| return false; |
| } |
| |
| if (freepages_ > pages_/2) { |
| logprintf(0, "Process Error: not enough pages for IO\n"); |
| logprintf(1, "Stats: Total: %lld, Needed: %lld, Available: %lld\n", |
| static_cast<int64>(pages_), |
| static_cast<int64>(freepages_), |
| static_cast<int64>(pages_/2)); |
| bad_status(); |
| return false; |
| } |
| logprintf(12, "Log: Allocating pages, Total: %lld Free: %lld\n", |
| pages_, |
| freepages_); |
| |
| // Initialize page locations. |
| for (int64 i = 0; i < pages_; i++) { |
| struct page_entry pe; |
| init_pe(&pe); |
| pe.offset = i * page_length_; |
| result &= PutEmpty(&pe); |
| } |
| |
| if (!result) { |
| logprintf(0, "Process Error: while initializing empty_ list\n"); |
| bad_status(); |
| return false; |
| } |
| |
| // Fill valid pages with test patterns. |
| // Use fill threads to do this. |
| WorkerStatus fill_status; |
| WorkerVector fill_vector; |
| |
| logprintf(12, "Starting Fill threads: %d threads, %d pages\n", |
| fill_threads_, pages_); |
| // Initialize the fill threads. |
| for (int i = 0; i < fill_threads_; i++) { |
| FillThread *thread = new FillThread(); |
| thread->InitThread(i, this, os_, patternlist_, &fill_status); |
| if (i != fill_threads_ - 1) { |
| logprintf(12, "Starting Fill Threads %d: %d pages\n", |
| i, pages_ / fill_threads_); |
| thread->SetFillPages(pages_ / fill_threads_); |
| // The last thread finishes up all the leftover pages. |
| } else { |
| logprintf(12, "Starting Fill Threads %d: %d pages\n", |
| i, pages_ - pages_ / fill_threads_ * i); |
| thread->SetFillPages(pages_ - pages_ / fill_threads_ * i); |
| } |
| fill_vector.push_back(thread); |
| } |
| |
| // Spawn the fill threads. |
| fill_status.Initialize(); |
| for (WorkerVector::const_iterator it = fill_vector.begin(); |
| it != fill_vector.end(); ++it) |
| (*it)->SpawnThread(); |
| |
| // Reap the finished fill threads. |
| for (WorkerVector::const_iterator it = fill_vector.begin(); |
| it != fill_vector.end(); ++it) { |
| (*it)->JoinThread(); |
| if ((*it)->GetStatus() != 1) { |
| logprintf(0, "Thread %d failed with status %d at %.2f seconds\n", |
| (*it)->ThreadID(), (*it)->GetStatus(), |
| (*it)->GetRunDurationUSec() * 1.0/1000000); |
| bad_status(); |
| return false; |
| } |
| delete (*it); |
| } |
| fill_vector.clear(); |
| fill_status.Destroy(); |
| logprintf(12, "Log: Done filling pages.\n"); |
| logprintf(12, "Log: Allocating pages.\n"); |
| |
| AddrMapInit(); |
| |
| // Initialize page locations. |
| for (int64 i = 0; i < pages_; i++) { |
| struct page_entry pe; |
| // Only get valid pages with uninitialized tags here. |
| char buf[256]; |
| if (GetValid(&pe, kInvalidTag)) { |
| int64 paddr = os_->VirtualToPhysical(pe.addr); |
| int32 region = os_->FindRegion(paddr); |
| |
| os_->FindDimm(paddr, buf, sizeof(buf)); |
| if (i < 256) { |
| logprintf(12, "Log: address: %#llx, %s\n", paddr, buf); |
| } |
| region_[region]++; |
| pe.paddr = paddr; |
| pe.tag = 1 << region; |
| region_mask_ |= pe.tag; |
| |
| // Generate a physical region map |
| AddrMapUpdate(&pe); |
| |
| // Note: this does not distribute free pages among the regions fairly. |
| // However, with a large enough (thousands) number of pages randomly |
| // marked free in each region, the free page counts in each region |
| // end up fairly balanced. |
| if (i < freepages_) { |
| result &= PutEmpty(&pe); |
| } else { |
| result &= PutValid(&pe); |
| } |
| } else { |
| logprintf(0, "Log: didn't tag all pages. %d - %d = %d\n", |
| pages_, i, pages_ - i); |
| return false; |
| } |
| } |
| logprintf(12, "Log: Done allocating pages.\n"); |
| |
| AddrMapPrint(); |
| |
| for (int i = 0; i < 32; i++) { |
| if (region_mask_ & (1 << i)) { |
| region_count_++; |
| logprintf(12, "Log: Region %d: %d.\n", i, region_[i]); |
| } |
| } |
| logprintf(5, "Log: Region mask: 0x%x\n", region_mask_); |
| |
| return true; |
| } |
| |
| // Print SAT version info. |
| bool Sat::PrintVersion() { |
| logprintf(1, "Stats: SAT revision %s, %d bit binary\n", |
| kVersion, address_mode_); |
| logprintf(5, "Log: %s from %s\n", Timestamp(), BuildChangelist()); |
| |
| return true; |
| } |
| |
| |
| // Initializes the resources that SAT needs to run. |
| // This needs to be called before Run(), and after ParseArgs(). |
| // Returns true on success, false on error, and will exit() on help message. |
| bool Sat::Initialize() { |
| g_sat = this; |
| |
| // Initializes sync'd log file to ensure output is saved. |
| if (!InitializeLogfile()) |
| return false; |
| Logger::GlobalLogger()->StartThread(); |
| |
| logprintf(5, "Log: Commandline - %s\n", cmdline_.c_str()); |
| PrintVersion(); |
| |
| std::map<std::string, std::string> options; |
| |
| GoogleOsOptions(&options); |
| |
| // Initialize OS/Hardware interface. |
| os_ = OsLayerFactory(options); |
| if (!os_) { |
| bad_status(); |
| return false; |
| } |
| |
| if (min_hugepages_mbytes_ > 0) |
| os_->SetMinimumHugepagesSize(min_hugepages_mbytes_ * kMegabyte); |
| |
| if (!os_->Initialize()) { |
| logprintf(0, "Process Error: Failed to initialize OS layer\n"); |
| bad_status(); |
| delete os_; |
| return false; |
| } |
| |
| // Checks that OS/Build/Platform is supported. |
| if (!CheckEnvironment()) |
| return false; |
| |
| if (error_injection_) |
| os_->set_error_injection(true); |
| |
| // Run SAT in monitor only mode, do not continue to allocate resources. |
| if (monitor_mode_) { |
| logprintf(5, "Log: Running in monitor-only mode. " |
| "Will not allocate any memory nor run any stress test. " |
| "Only polling ECC errors.\n"); |
| return true; |
| } |
| |
| // Allocate the memory to test. |
| if (!AllocateMemory()) |
| return false; |
| |
| logprintf(5, "Stats: Starting SAT, %dM, %d seconds\n", |
| static_cast<int>(size_/kMegabyte), |
| runtime_seconds_); |
| |
| if (!InitializePatterns()) |
| return false; |
| |
| // Initialize memory allocation. |
| pages_ = size_ / page_length_; |
| |
| // Allocate page queue depending on queue implementation switch. |
| if (pe_q_implementation_ == SAT_FINELOCK) { |
| finelock_q_ = new FineLockPEQueue(pages_, page_length_); |
| if (finelock_q_ == NULL) |
| return false; |
| finelock_q_->set_os(os_); |
| os_->set_err_log_callback(finelock_q_->get_err_log_callback()); |
| } else if (pe_q_implementation_ == SAT_ONELOCK) { |
| empty_ = new PageEntryQueue(pages_); |
| valid_ = new PageEntryQueue(pages_); |
| if ((empty_ == NULL) || (valid_ == NULL)) |
| return false; |
| } |
| |
| if (!InitializePages()) { |
| logprintf(0, "Process Error: Initialize Pages failed\n"); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| // Constructor and destructor. |
| Sat::Sat() { |
| // Set defaults, command line might override these. |
| runtime_seconds_ = 20; |
| page_length_ = kSatPageSize; |
| disk_pages_ = kSatDiskPage; |
| pages_ = 0; |
| size_mb_ = 0; |
| size_ = size_mb_ * kMegabyte; |
| min_hugepages_mbytes_ = 0; |
| freepages_ = 0; |
| paddr_base_ = 0; |
| |
| user_break_ = false; |
| verbosity_ = 8; |
| Logger::GlobalLogger()->SetVerbosity(verbosity_); |
| strict_ = 1; |
| warm_ = 0; |
| run_on_anything_ = 0; |
| use_logfile_ = 0; |
| logfile_ = 0; |
| // Detect 32/64 bit binary. |
| void *pvoid = 0; |
| address_mode_ = sizeof(pvoid) * 8; |
| error_injection_ = false; |
| crazy_error_injection_ = false; |
| max_errorcount_ = 0; // Zero means no early exit. |
| stop_on_error_ = false; |
| error_poll_ = true; |
| findfiles_ = false; |
| |
| do_page_map_ = false; |
| page_bitmap_ = 0; |
| page_bitmap_size_ = 0; |
| |
| // Cache coherency data initialization. |
| cc_test_ = false; // Flag to trigger cc threads. |
| cc_cacheline_count_ = 2; // Two datastructures of cache line size. |
| cc_inc_count_ = 1000; // Number of times to increment the shared variable. |
| cc_cacheline_data_ = 0; // Cache Line size datastructure. |
| |
| sat_assert(0 == pthread_mutex_init(&worker_lock_, NULL)); |
| file_threads_ = 0; |
| net_threads_ = 0; |
| listen_threads_ = 0; |
| // Default to autodetect number of cpus, and run that many threads. |
| memory_threads_ = -1; |
| invert_threads_ = 0; |
| fill_threads_ = 8; |
| check_threads_ = 0; |
| cpu_stress_threads_ = 0; |
| disk_threads_ = 0; |
| total_threads_ = 0; |
| |
| region_mask_ = 0; |
| region_count_ = 0; |
| for (int i = 0; i < 32; i++) { |
| region_[i] = 0; |
| } |
| region_mode_ = 0; |
| |
| errorcount_ = 0; |
| statuscount_ = 0; |
| |
| valid_ = 0; |
| empty_ = 0; |
| finelock_q_ = 0; |
| // Default to use fine-grain lock for better performance. |
| pe_q_implementation_ = SAT_FINELOCK; |
| |
| os_ = 0; |
| patternlist_ = 0; |
| logfilename_[0] = 0; |
| |
| read_block_size_ = 512; |
| write_block_size_ = -1; |
| segment_size_ = -1; |
| cache_size_ = -1; |
| blocks_per_segment_ = -1; |
| read_threshold_ = -1; |
| write_threshold_ = -1; |
| non_destructive_ = 1; |
| monitor_mode_ = 0; |
| tag_mode_ = 0; |
| random_threads_ = 0; |
| |
| pause_delay_ = 600; |
| pause_duration_ = 15; |
| } |
| |
| // Destructor. |
| Sat::~Sat() { |
| // We need to have called Cleanup() at this point. |
| // We should probably enforce this. |
| } |
| |
| |
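| // Command line parsing helpers. ARG_KVALUE sets 'variable' to 'value' when |
| // 'argument' matches, ARG_IVALUE parses the next argument as an integer, |
| // and ARG_SVALUE copies the next argument into a string buffer. |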
| #define ARG_KVALUE(argument, variable, value) \ |
| if (!strcmp(argv[i], argument)) { \ |
| variable = value; \ |
| continue; \ |
| } |
| |
| #define ARG_IVALUE(argument, variable) \ |
| if (!strcmp(argv[i], argument)) { \ |
| i++; \ |
| if (i < argc) \ |
| variable = strtoull(argv[i], NULL, 0); \ |
| continue; \ |
| } |
| |
| #define ARG_SVALUE(argument, variable) \ |
| if (!strcmp(argv[i], argument)) { \ |
| i++; \ |
| if (i < argc) \ |
| snprintf(variable, sizeof(variable), "%s", argv[i]); \ |
| continue; \ |
| } |
| |
| // Configures SAT from command line arguments. |
| // This will call exit() given a request for |
| // self-documentation or unexpected args. |
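| // Example (assuming a 64 bit build named sat64): './sat64 -M 2048 -s 120 |
| // -l sat.log' tests 2048MB of memory for 120 seconds, logging to 'sat.log'. |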
| bool Sat::ParseArgs(int argc, char **argv) { |
| int i; |
| uint64 filesize = page_length_ * disk_pages_; |
| |
| // Parse each argument. |
| for (i = 1; i < argc; i++) { |
| // Switch to fall back to the coarse-grain-lock queue (for benchmarking). |
| ARG_KVALUE("--coarse_grain_lock", pe_q_implementation_, SAT_ONELOCK); |
| |
| // Set number of megabytes to use. |
| ARG_IVALUE("-M", size_mb_); |
| |
| // Set minimum megabytes of hugepages to require. |
| ARG_IVALUE("-H", min_hugepages_mbytes_); |
| |
| // Set number of seconds to run. |
| ARG_IVALUE("-s", runtime_seconds_); |
| |
| // Set number of memory copy threads. |
| ARG_IVALUE("-m", memory_threads_); |
| |
| // Set number of memory invert threads. |
| ARG_IVALUE("-i", invert_threads_); |
| |
| // Set number of check-only threads. |
| ARG_IVALUE("-c", check_threads_); |
| |
| // Set number of times to increment each cache line's shared variable. |
| ARG_IVALUE("--cc_inc_count", cc_inc_count_); |
| |
| // Set number of cache line size datastructures. |
| ARG_IVALUE("--cc_line_count", cc_cacheline_count_); |
| |
| // Flag set when cache coherency tests need to be run |
| ARG_KVALUE("--cc_test", cc_test_, 1); |
| |
| // Set number of CPU stress threads. |
| ARG_IVALUE("-C", cpu_stress_threads_); |
| |
| // Set logfile name. |
| ARG_SVALUE("-l", logfilename_); |
| |
| // Verbosity level. |
| ARG_IVALUE("-v", verbosity_); |
| |
| // Set maximum number of errors to collect. Stop running after this many. |
| ARG_IVALUE("--max_errors", max_errorcount_); |
| |
| // Set pattern block size. |
| ARG_IVALUE("-p", page_length_); |
| |
| // Set the size of disk IO tempfiles. |
| ARG_IVALUE("--filesize", filesize); |
| |
| // NUMA options. |
| ARG_KVALUE("--local_numa", region_mode_, kLocalNuma); |
| ARG_KVALUE("--remote_numa", region_mode_, kRemoteNuma); |
| |
| // Autodetect tempfile locations. |
| ARG_KVALUE("--findfiles", findfiles_, 1); |
| |
| // Inject errors to force miscompare code paths |
| ARG_KVALUE("--force_errors", error_injection_, true); |
| ARG_KVALUE("--force_errors_like_crazy", crazy_error_injection_, true); |
| if (crazy_error_injection_) |
| error_injection_ = true; |
| |
| // Stop immediately on any error, for debugging HW problems. |
| ARG_KVALUE("--stop_on_errors", stop_on_error_, 1); |
| |
| // Don't use internal error polling, allow external detection. |
| ARG_KVALUE("--no_errors", error_poll_, 0); |
| |
| // Never check data as you go. |
| ARG_KVALUE("-F", strict_, 0); |
| |
| // Warm the cpu as you go. |
| ARG_KVALUE("-W", warm_, 1); |
| |
| // Allow running on unknown systems with the base, unimplemented OsLayer. |
| ARG_KVALUE("-A", run_on_anything_, 1); |
| |
| // Size of read blocks for disk test. |
| ARG_IVALUE("--read-block-size", read_block_size_); |
| |
| // Size of write blocks for disk test. |
| ARG_IVALUE("--write-block-size", write_block_size_); |
| |
| // Size of segment for disk test. |
| ARG_IVALUE("--segment-size", segment_size_); |
| |
| // Size of disk cache size for disk test. |
| ARG_IVALUE("--cache-size", cache_size_); |
| |
| // Number of blocks to test per segment. |
| ARG_IVALUE("--blocks-per-segment", blocks_per_segment_); |
| |
| // Maximum time a block read should take before warning. |
| ARG_IVALUE("--read-threshold", read_threshold_); |
| |
| // Maximum time a block write should take before warning. |
| ARG_IVALUE("--write-threshold", write_threshold_); |
| |
| // Do not write anything to disk in the disk test. |
| ARG_KVALUE("--destructive", non_destructive_, 0); |
| |
| // Run SAT in monitor mode. No test load at all. |
| ARG_KVALUE("--monitor_mode", monitor_mode_, true); |
| |
| // Run SAT in tag mode. Tag all cachelines by virt addr. |
| ARG_KVALUE("--tag_mode", tag_mode_, true); |
| |
| // Dump range map of tested pages. |
| ARG_KVALUE("--do_page_map", do_page_map_, true); |
| |
| // Specify the physical address base to test. |
| ARG_IVALUE("--paddr_base", paddr_base_); |
| |
| // Specify the frequency for power spikes. |
| ARG_IVALUE("--pause_delay", pause_delay_); |
| |
| // Specify the duration of each pause (for power spikes). |
| ARG_IVALUE("--pause_duration", pause_duration_); |
| |
| // Disk device names |
| if (!strcmp(argv[i], "-d")) { |
| i++; |
| if (i < argc) { |
| disk_threads_++; |
| diskfilename_.push_back(string(argv[i])); |
| blocktables_.push_back(new DiskBlockTable()); |
| } |
| continue; |
| } |
| |
| // Set number of disk random threads for each disk write thread. |
| ARG_IVALUE("--random-threads", random_threads_); |
| |
| // Set a tempfile to use in a file thread. |
| if (!strcmp(argv[i], "-f")) { |
| i++; |
| if (i < argc) { |
| file_threads_++; |
| filename_.push_back(string(argv[i])); |
| } |
| continue; |
| } |
| |
| // Set a hostname to use in a network thread. |
| if (!strcmp(argv[i], "-n")) { |
| i++; |
| if (i < argc) { |
| net_threads_++; |
| ipaddrs_.push_back(string(argv[i])); |
| } |
| continue; |
| } |
| |
| // Run threads that listen for incoming SAT net connections. |
| ARG_KVALUE("--listen", listen_threads_, 1); |
| |
| if (CheckGoogleSpecificArgs(argc, argv, &i)) { |
| continue; |
| } |
| |
| // Default: |
| PrintVersion(); |
| PrintHelp(); |
| if (strcmp(argv[i], "-h") && strcmp(argv[i], "--help")) { |
| printf("\n Unknown argument %s\n", argv[i]); |
| bad_status(); |
| exit(1); |
| } |
| // Forget it, we printed the help, just bail. |
| // We don't want to print test status, or any log parser stuff. |
| exit(0); |
| } |
| |
| Logger::GlobalLogger()->SetVerbosity(verbosity_); |
| |
| // Update relevant data members with parsed input. |
| // Translate MB into bytes. |
| size_ = static_cast<int64>(size_mb_) * kMegabyte; |
| |
| // Set logfile flag. |
| if (strcmp(logfilename_, "")) |
| use_logfile_ = 1; |
| // Check that the page length is a power of two of at least 1024 bytes. |
| if (page_length_ && |
| !(page_length_ & (page_length_ - 1)) && |
| (page_length_ > 1023)) { |
| // Prints if we have changed from default. |
| if (page_length_ != kSatPageSize) |
| logprintf(12, "Log: Updating page size to %d\n", page_length_); |
| } else { |
| // Revert to default page length. |
| logprintf(6, "Process Error: " |
| "Invalid page size %d\n", page_length_); |
| page_length_ = kSatPageSize; |
| return false; |
| } |
| |
| // Set disk_pages_ if filesize or page size changed. |
| if (filesize != static_cast<uint64>(page_length_) * |
| static_cast<uint64>(disk_pages_)) { |
| disk_pages_ = filesize / page_length_; |
| if (disk_pages_ == 0) |
| disk_pages_ = 1; |
| } |
| |
| // Reassemble the command line into cmdline_ for logging. |
| for (int i = 0; i < argc; i++) { |
| if (i) |
| cmdline_ += " "; |
| cmdline_ += argv[i]; |
| } |
| |
| return true; |
| } |
| |
| void Sat::PrintHelp() { |
| printf("Usage: ./sat(32|64) [options]\n" |
| " -M mbytes megabytes of ram to test\n" |
| " -H mbytes minimum megabytes of hugepages to require\n" |
| " -s seconds number of seconds to run\n" |
| " -m threads number of memory copy threads to run\n" |
| " -i threads number of memory invert threads to run\n" |
| " -C threads number of memory CPU stress threads to run\n" |
| " --findfiles find locations to do disk IO automatically\n" |
| " -d device add a direct write disk thread with block " |
| "device (or file) 'device'\n" |
| " -f filename add a disk thread with " |
| "tempfile 'filename'\n" |
| " -l logfile log output to file 'logfile'\n" |
| " --max_errors n exit early after finding 'n' errors\n" |
| " -v level verbosity (0-20), default is 8\n" |
| " -W Use more CPU-stressful memory copy\n" |
| " -A run in degraded mode on incompatible systems\n" |
| " -p pagesize size in bytes of memory chunks\n" |
| " --filesize size size of disk IO tempfiles\n" |
| " -n ipaddr add a network thread connecting to " |
| "system at 'ipaddr'\n" |
| " --listen run a thread to listen for and respond " |
| "to network threads.\n" |
| " --no_errors run without checking for ECC or other errors\n" |
| " --force_errors inject false errors to test error handling\n" |
| " --force_errors_like_crazy inject a lot of false errors " |
| "to test error handling\n" |
| " -F don't result check each transaction\n" |
| " --stop_on_errors Stop after finding the first error.\n" |
| " --read-block-size size of block for reading (-d)\n" |
| " --write-block-size size of block for writing (-d). If not " |
| "defined, the size of block for writing will be defined as the " |
| "size of block for reading\n" |
| " --segment-size size of segments to split disk into (-d)\n" |
| " --cache-size size of disk cache (-d)\n" |
| " --blocks-per-segment number of blocks to read/write per " |
| "segment per iteration (-d)\n" |
| " --read-threshold maximum time (in us) a block read should " |
| "take (-d)\n" |
| " --write-threshold maximum time (in us) a block write " |
| "should take (-d)\n" |
| " --random-threads number of random threads for each disk " |
| "write thread (-d)\n" |
| " --destructive write/wipe disk partition (-d)\n" |
| " --monitor_mode only do ECC error polling, no stress load.\n" |
| " --cc_test do the cache coherency testing\n" |
| " --cc_inc_count number of times to increment the " |
| "cacheline's member\n" |
| " --cc_line_count number of cache line sized datastructures " |
| "to allocate for the cache coherency threads to operate\n" |
| " --paddr_base allocate memory starting from this address\n" |
| " --pause_delay delay (in seconds) between power spikes\n" |
| " --pause_duration duration (in seconds) of each pause\n" |
| " --local_numa : choose memory regions associated with " |
| "each CPU to be tested by that CPU\n" |
| " --remote_numa : choose memory regions not associated with " |
| "each CPU to be tested by that CPU\n"); |
| } |
| |
| bool Sat::CheckGoogleSpecificArgs(int argc, char **argv, int *i) { |
| // Do nothing, no google-specific argument on public stressapptest |
| return false; |
| } |
| |
| void Sat::GoogleOsOptions(std::map<std::string, std::string> *options) { |
| // Do nothing, no OS-specific argument on public stressapptest |
| } |
| |
| // Create and configure the SAT worker threads. |
| void Sat::InitializeThreads() { |
| // Memory copy threads. |
| AcquireWorkerLock(); |
| |
| logprintf(12, "Log: Starting worker threads\n"); |
| WorkerVector *memory_vector = new WorkerVector(); |
| |
| // Error polling thread. |
| // This may detect ECC corrected errors, disk problems, or |
| // any other errors normally hidden from userspace. |
| WorkerVector *error_vector = new WorkerVector(); |
| if (error_poll_) { |
| ErrorPollThread *thread = new ErrorPollThread(); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &continuous_status_); |
| |
| error_vector->insert(error_vector->end(), thread); |
| } else { |
| logprintf(5, "Log: Skipping error poll thread due to --no_errors flag\n"); |
| } |
| workers_map_.insert(make_pair(kErrorType, error_vector)); |
| |
| // Only start error poll threads for monitor-mode SAT, |
| // skip all other types of worker threads. |
| if (monitor_mode_) { |
| ReleaseWorkerLock(); |
| return; |
| } |
| |
| for (int i = 0; i < memory_threads_; i++) { |
| CopyThread *thread = new CopyThread(); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &power_spike_status_); |
| |
| if ((region_count_ > 1) && (region_mode_)) { |
| int32 region = region_find(i % region_count_); |
| cpu_set_t *cpuset = os_->FindCoreMask(region); |
| sat_assert(cpuset); |
| if (region_mode_ == kLocalNuma) { |
| // Choose regions associated with this CPU. |
| thread->set_cpu_mask(cpuset); |
| thread->set_tag(1 << region); |
| } else if (region_mode_ == kRemoteNuma) { |
| // Choose regions not associated with this CPU. |
| thread->set_cpu_mask(cpuset); |
| thread->set_tag(region_mask_ & ~(1 << region)); |
| } |
| } else { |
| cpu_set_t available_cpus; |
| thread->AvailableCpus(&available_cpus); |
| int cores = cpuset_count(&available_cpus); |
| // Don't restrict thread location if we have more than one |
| // thread per core. Not so good for performance. |
| if (cpu_stress_threads_ + memory_threads_ <= cores) { |
| // Place a thread on alternating cores first. |
| // This assures interleaved core use with no overlap. |
| int nthcore = i; |
| int nthbit = (((2 * nthcore) % cores) + |
| (((2 * nthcore) / cores) % 2)) % cores; |
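| // e.g. with 8 cores, memory threads 0-7 land on cores 0, 2, 4, 6, 1, 3, 5, 7. |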
| cpu_set_t all_cores; |
| cpuset_set_ab(&all_cores, 0, cores); |
| if (!cpuset_isequal(&available_cpus, &all_cores)) { |
| // We are assuming the bits are contiguous. |
| // Complain if this is not so. |
| logprintf(0, "Log: cores = %s, expected %s\n", |
| cpuset_format(&available_cpus).c_str(), |
| cpuset_format(&all_cores).c_str()); |
| } |
| |
| // Set thread affinity. |
| thread->set_cpu_mask_to_cpu(nthbit); |
| } |
| } |
| memory_vector->insert(memory_vector->end(), thread); |
| } |
| workers_map_.insert(make_pair(kMemoryType, memory_vector)); |
| |
| // File IO threads. |
| WorkerVector *fileio_vector = new WorkerVector(); |
| for (int i = 0; i < file_threads_; i++) { |
| FileThread *thread = new FileThread(); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &power_spike_status_); |
| thread->SetFile(filename_[i].c_str()); |
| // Set file IO threads to high priority. They don't take much processor |
| // time, but blocking them will delay disk IO. |
| thread->SetPriority(WorkerThread::High); |
| |
| fileio_vector->insert(fileio_vector->end(), thread); |
| } |
| workers_map_.insert(make_pair(kFileIOType, fileio_vector)); |
| |
| // Net IO threads. |
| WorkerVector *netio_vector = new WorkerVector(); |
| WorkerVector *netslave_vector = new WorkerVector(); |
| if (listen_threads_ > 0) { |
| // Create a network slave thread. This listens for connections. |
| NetworkListenThread *thread = new NetworkListenThread(); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &continuous_status_); |
| |
| netslave_vector->insert(netslave_vector->end(), thread); |
| } |
| for (int i = 0; i < net_threads_; i++) { |
| NetworkThread *thread = new NetworkThread(); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &continuous_status_); |
| thread->SetIP(ipaddrs_[i].c_str()); |
| |
| netio_vector->insert(netio_vector->end(), thread); |
| } |
| workers_map_.insert(make_pair(kNetIOType, netio_vector)); |
| workers_map_.insert(make_pair(kNetSlaveType, netslave_vector)); |
| |
| // Result check threads. |
| WorkerVector *check_vector = new WorkerVector(); |
| for (int i = 0; i < check_threads_; i++) { |
| CheckThread *thread = new CheckThread(); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &continuous_status_); |
| |
| check_vector->insert(check_vector->end(), thread); |
| } |
| workers_map_.insert(make_pair(kCheckType, check_vector)); |
| |
| // Memory invert threads. |
| logprintf(12, "Log: Starting invert threads\n"); |
| WorkerVector *invert_vector = new WorkerVector(); |
| for (int i = 0; i < invert_threads_; i++) { |
| InvertThread *thread = new InvertThread(); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &continuous_status_); |
| |
| invert_vector->insert(invert_vector->end(), thread); |
| } |
| workers_map_.insert(make_pair(kInvertType, invert_vector)); |
| |
| // Disk stress threads. |
| WorkerVector *disk_vector = new WorkerVector(); |
| WorkerVector *random_vector = new WorkerVector(); |
| logprintf(12, "Log: Starting disk stress threads\n"); |
| for (int i = 0; i < disk_threads_; i++) { |
| // Creating write threads |
| DiskThread *thread = new DiskThread(blocktables_[i]); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &power_spike_status_); |
| thread->SetDevice(diskfilename_[i].c_str()); |
| if (thread->SetParameters(read_block_size_, write_block_size_, |
| segment_size_, cache_size_, |
| blocks_per_segment_, |
| read_threshold_, write_threshold_, |
| non_destructive_)) { |
| disk_vector->insert(disk_vector->end(), thread); |
| } else { |
| logprintf(12, "Log: DiskThread::SetParameters() failed\n"); |
| delete thread; |
| } |
| |
| for (int j = 0; j < random_threads_; j++) { |
| // Creating random threads |
| RandomDiskThread *rthread = new RandomDiskThread(blocktables_[i]); |
| rthread->InitThread(total_threads_++, this, os_, patternlist_, |
| &power_spike_status_); |
| rthread->SetDevice(diskfilename_[i].c_str()); |
| if (rthread->SetParameters(read_block_size_, write_block_size_, |
| segment_size_, cache_size_, |
| blocks_per_segment_, |
| read_threshold_, write_threshold_, |
| non_destructive_)) { |
| random_vector->insert(random_vector->end(), rthread); |
| } else { |
| logprintf(12, "Log: RandomDiskThread::SetParameters() failed\n"); |
| delete rthread; |
| } |
| } |
| } |
| |
| workers_map_.insert(make_pair(kDiskType, disk_vector)); |
| workers_map_.insert(make_pair(kRandomDiskType, random_vector)); |
| |
| // CPU stress threads. |
| WorkerVector *cpu_vector = new WorkerVector(); |
| logprintf(12, "Log: Starting cpu stress threads\n"); |
| for (int i = 0; i < cpu_stress_threads_; i++) { |
| CpuStressThread *thread = new CpuStressThread(); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &continuous_status_); |
| |
| // Don't restrict thread location if we have more than one |
| // thread per core. Not so good for performance. |
| cpu_set_t available_cpus; |
| thread->AvailableCpus(&available_cpus); |
| int cores = cpuset_count(&available_cpus); |
| if (cpu_stress_threads_ + memory_threads_ <= cores) { |
| // Place a thread on alternating cores first. |
| // Go in reverse order for CPU stress threads. This assures interleaved |
| // core use with no overlap. |
| int nthcore = (cores - 1) - i; |
| int nthbit = (((2 * nthcore) % cores) + |
| (((2 * nthcore) / cores) % 2)) % cores; |
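| // e.g. with 8 cores, CPU stress threads 0-7 land on cores 7, 5, 3, 1, 6, 4, 2, 0. |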
| cpu_set_t all_cores; |
| cpuset_set_ab(&all_cores, 0, cores); |
| if (!cpuset_isequal(&available_cpus, &all_cores)) { |
| logprintf(0, "Log: cores = %s, expected %s\n", |
| cpuset_format(&available_cpus).c_str(), |
| cpuset_format(&all_cores).c_str()); |
| } |
| |
| // Set thread affinity. |
| thread->set_cpu_mask_to_cpu(nthbit); |
| } |
| |
| |
| cpu_vector->insert(cpu_vector->end(), thread); |
| } |
| workers_map_.insert(make_pair(kCPUType, cpu_vector)); |
| |
| // CPU Cache Coherency Threads - one for each core available. |
| if (cc_test_) { |
| WorkerVector *cc_vector = new WorkerVector(); |
| logprintf(12, "Log: Starting cpu cache coherency threads\n"); |
| |
| // Allocate the shared datastructure to be worked on by the threads. |
| cc_cacheline_data_ = reinterpret_cast<cc_cacheline_data*>( |
| malloc(sizeof(cc_cacheline_data) * cc_cacheline_count_)); |
| sat_assert(cc_cacheline_data_ != NULL); |
| |
| // Initialize the structure. |
| memset(cc_cacheline_data_, 0, |
| sizeof(cc_cacheline_data) * cc_cacheline_count_); |
| |
| int num_cpus = CpuCount(); |
| // Allocate all the nums once so that we get a single chunk |
| // of contiguous memory. |
| int *num; |
| int err_result = posix_memalign( |
| reinterpret_cast<void**>(&num), |
| kCacheLineSize, sizeof(*num) * num_cpus * cc_cacheline_count_); |
| sat_assert(err_result == 0); |
| |
| int cline; |
| for (cline = 0; cline < cc_cacheline_count_; cline++) { |
| memset(num, 0, sizeof(*num) * num_cpus); |
| cc_cacheline_data_[cline].num = num; |
| num += num_cpus; |
| } |
| |
| int tnum; |
| for (tnum = 0; tnum < num_cpus; tnum++) { |
| CpuCacheCoherencyThread *thread = |
| new CpuCacheCoherencyThread(cc_cacheline_data_, cc_cacheline_count_, |
| tnum, cc_inc_count_); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &continuous_status_); |
| // Pin the thread to a particular core. |
| thread->set_cpu_mask_to_cpu(tnum); |
| |
| // Insert the thread into the vector. |
| cc_vector->insert(cc_vector->end(), thread); |
| } |
| workers_map_.insert(make_pair(kCCType, cc_vector)); |
| } |
| ReleaseWorkerLock(); |
| } |
| |
| // Return the number of cpus actually present in the machine. |
| int Sat::CpuCount() { |
| return sysconf(_SC_NPROCESSORS_CONF); |
| } |
| |
| // Notify and reap worker threads. |
| void Sat::JoinThreads() { |
| logprintf(12, "Log: Joining worker threads\n"); |
| power_spike_status_.StopWorkers(); |
| continuous_status_.StopWorkers(); |
| |
| AcquireWorkerLock(); |
| for (WorkerMap::const_iterator map_it = workers_map_.begin(); |
| map_it != workers_map_.end(); ++map_it) { |
| for (WorkerVector::const_iterator it = map_it->second->begin(); |
| it != map_it->second->end(); ++it) { |
| logprintf(12, "Log: Joining thread %d\n", (*it)->ThreadID()); |
| (*it)->JoinThread(); |
| } |
| } |
| ReleaseWorkerLock(); |
| |
| QueueStats(); |
| |
| // Finish up result checking. |
| // Spawn check threads (one per fill thread) to minimize check time. |
| logprintf(12, "Log: Finished countdown, begin to result check\n"); |
| WorkerStatus reap_check_status; |
| WorkerVector reap_check_vector; |
| |
| // No need for check threads for monitor mode. |
| if (!monitor_mode_) { |
| // Initialize the check threads. |
| for (int i = 0; i < fill_threads_; i++) { |
| CheckThread *thread = new CheckThread(); |
| thread->InitThread(total_threads_++, this, os_, patternlist_, |
| &reap_check_status); |
| logprintf(12, "Log: Finished countdown, begin to result check\n"); |
| reap_check_vector.push_back(thread); |
| } |
| } |
| |
| reap_check_status.Initialize(); |
| // Check threads should be marked to stop ASAP. |
| reap_check_status.StopWorkers(); |
| |
| // Spawn the check threads. |
| for (WorkerVector::const_iterator it = reap_check_vector.begin(); |
| it != reap_check_vector.end(); ++it) { |
| logprintf(12, "Log: Spawning thread %d\n", (*it)->ThreadID()); |
| (*it)->SpawnThread(); |
| } |
| |
| // Join the check threads. |
| for (WorkerVector::const_iterator it = reap_check_vector.begin(); |
| it != reap_check_vector.end(); ++it) { |
| logprintf(12, "Log: Joining thread %d\n", (*it)->ThreadID()); |
| (*it)->JoinThread(); |
| } |
| |
| // Reap all children. Stopped threads should have already ended. |
| // Result checking threads will end when they have finished |
| // result checking. |
| logprintf(12, "Log: Join all outstanding threads\n"); |
| |
| // Find all errors. |
| errorcount_ = GetTotalErrorCount(); |
| |
| AcquireWorkerLock(); |
| for (WorkerMap::const_iterator map_it = workers_map_.begin(); |
| map_it != workers_map_.end(); ++map_it) { |
| for (WorkerVector::const_iterator it = map_it->second->begin(); |
| it != map_it->second->end(); ++it) { |
| logprintf(12, "Log: Reaping thread status %d\n", (*it)->ThreadID()); |
| if ((*it)->GetStatus() != 1) { |
| logprintf(0, "Process Error: Thread %d failed with status %d at " |
| "%.2f seconds\n", |
| (*it)->ThreadID(), (*it)->GetStatus(), |
| (*it)->GetRunDurationUSec()*1.0/1000000); |
| bad_status(); |
| } |
| int priority = 12; |
| if ((*it)->GetErrorCount()) |
| priority = 5; |
| logprintf(priority, "Log: Thread %d found %lld hardware incidents\n", |
| (*it)->ThreadID(), (*it)->GetErrorCount()); |
| } |
| } |
| ReleaseWorkerLock(); |
| |
| |
| // Add in any errors from check threads. |
| for (WorkerVector::const_iterator it = reap_check_vector.begin(); |
| it != reap_check_vector.end(); ++it) { |
| logprintf(12, "Log: Reaping thread status %d\n", (*it)->ThreadID()); |
| if ((*it)->GetStatus() != 1) { |
| logprintf(0, "Process Error: Thread %d failed with status %d at " |
| "%.2f seconds\n", |
| (*it)->ThreadID(), (*it)->GetStatus(), |
| (*it)->GetRunDurationUSec()*1.0/1000000); |
| bad_status(); |
| } |
| errorcount_ += (*it)->GetErrorCount(); |
| int priority = 12; |
| if ((*it)->GetErrorCount()) |
| priority = 5; |
| logprintf(priority, "Log: Thread %d found %lld hardware incidents\n", |
| (*it)->ThreadID(), (*it)->GetErrorCount()); |
| delete (*it); |
| } |
| reap_check_vector.clear(); |
| reap_check_status.Destroy(); |
| } |
| |
| // Print queuing information. |
| void Sat::QueueStats() { |
| finelock_q_->QueueAnalysis(); |
| } |
| |
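| // Print overall stats: total data copied, longest thread runtime, and |
| // aggregate bandwidth. |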
| void Sat::AnalysisAllStats() { |
| float max_runtime_sec = 0.; |
| float total_data = 0.; |
| float total_bandwidth = 0.; |
| float thread_runtime_sec = 0.; |
| |
| for (WorkerMap::const_iterator map_it = workers_map_.begin(); |
| map_it != workers_map_.end(); ++map_it) { |
| for (WorkerVector::const_iterator it = map_it->second->begin(); |
| it != map_it->second->end(); ++it) { |
| thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000; |
| total_data += (*it)->GetMemoryCopiedData(); |
| total_data += (*it)->GetDeviceCopiedData(); |
| if (thread_runtime_sec > max_runtime_sec) { |
| max_runtime_sec = thread_runtime_sec; |
| } |
| } |
| } |
| |
| total_bandwidth = total_data / max_runtime_sec; |
| |
| logprintf(0, "Stats: Completed: %.2fM in %.2fs %.2fMB/s, " |
| "with %d hardware incidents, %d errors\n", |
| total_data, |
| max_runtime_sec, |
| total_bandwidth, |
| errorcount_, |
| statuscount_); |
| } |
| |
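| // Print memory copy data and bandwidth from memory and file IO threads. |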
| void Sat::MemoryStats() { |
| float memcopy_data = 0.; |
| float memcopy_bandwidth = 0.; |
| WorkerMap::const_iterator mem_it = workers_map_.find( |
| static_cast<int>(kMemoryType)); |
| WorkerMap::const_iterator file_it = workers_map_.find( |
| static_cast<int>(kFileIOType)); |
| sat_assert(mem_it != workers_map_.end()); |
| sat_assert(file_it != workers_map_.end()); |
| for (WorkerVector::const_iterator it = mem_it->second->begin(); |
| it != mem_it->second->end(); ++it) { |
| memcopy_data += (*it)->GetMemoryCopiedData(); |
| memcopy_bandwidth += (*it)->GetMemoryBandwidth(); |
| } |
| for (WorkerVector::const_iterator it = file_it->second->begin(); |
| it != file_it->second->end(); ++it) { |
| memcopy_data += (*it)->GetMemoryCopiedData(); |
| memcopy_bandwidth += (*it)->GetMemoryBandwidth(); |
| } |
| GoogleMemoryStats(&memcopy_data, &memcopy_bandwidth); |
| logprintf(4, "Stats: Memory Copy: %.2fM at %.2fMB/s\n", |
| memcopy_data, |
| memcopy_bandwidth); |
| } |
| |
| void Sat::GoogleMemoryStats(float *memcopy_data, |
| float *memcopy_bandwidth) { |
| // Do nothing, should be implemented by subclasses. |
| } |
| |
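| // Print file IO data and bandwidth stats. |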
| void Sat::FileStats() { |
| float file_data = 0.; |
| float file_bandwidth = 0.; |
| WorkerMap::const_iterator file_it = workers_map_.find( |
| static_cast<int>(kFileIOType)); |
| sat_assert(file_it != workers_map_.end()); |
| for (WorkerVector::const_iterator it = file_it->second->begin(); |
| it != file_it->second->end(); ++it) { |
| file_data += (*it)->GetDeviceCopiedData(); |
| file_bandwidth += (*it)->GetDeviceBandwidth(); |
| } |
| logprintf(4, "Stats: File Copy: %.2fM at %.2fMB/s\n", |
| file_data, |
| file_bandwidth); |
| } |
| |
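| // Print data check stats from the check threads. |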
| void Sat::CheckStats() { |
| float check_data = 0.; |
| float check_bandwidth = 0.; |
| WorkerMap::const_iterator check_it = workers_map_.find( |
| static_cast<int>(kCheckType)); |
| sat_assert(check_it != workers_map_.end()); |
| for (WorkerVector::const_iterator it = check_it->second->begin(); |
| it != check_it->second->end(); ++it) { |
| check_data += (*it)->GetMemoryCopiedData(); |
| check_bandwidth += (*it)->GetMemoryBandwidth(); |
| } |
| logprintf(4, "Stats: Data Check: %.2fM at %.2fMB/s\n", |
| check_data, |
| check_bandwidth); |
| } |
| |
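| // Print network IO data and bandwidth from net and listen threads. |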
| void Sat::NetStats() { |
| float net_data = 0.; |
| float net_bandwidth = 0.; |
| WorkerMap::const_iterator netio_it = workers_map_.find( |
| static_cast<int>(kNetIOType)); |
| WorkerMap::const_iterator netslave_it = workers_map_.find( |
| static_cast<int>(kNetSlaveType)); |
| sat_assert(netio_it != workers_map_.end()); |
| sat_assert(netslave_it != workers_map_.end()); |
| for (WorkerVector::const_iterator it = netio_it->second->begin(); |
| it != netio_it->second->end(); ++it) { |
| net_data += (*it)->GetDeviceCopiedData(); |
| net_bandwidth += (*it)->GetDeviceBandwidth(); |
| } |
| for (WorkerVector::const_iterator it = netslave_it->second->begin(); |
| it != netslave_it->second->end(); ++it) { |
| net_data += (*it)->GetDeviceCopiedData(); |
| net_bandwidth += (*it)->GetDeviceBandwidth(); |
| } |
| logprintf(4, "Stats: Net Copy: %.2fM at %.2fMB/s\n", |
| net_data, |
| net_bandwidth); |
| } |
| |
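| // Print memory invert data and bandwidth stats. |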
| void Sat::InvertStats() { |
| float invert_data = 0.; |
| float invert_bandwidth = 0.; |
| WorkerMap::const_iterator invert_it = workers_map_.find( |
| static_cast<int>(kInvertType)); |
| sat_assert(invert_it != workers_map_.end()); |
| for (WorkerVector::const_iterator it = invert_it->second->begin(); |
| it != invert_it->second->end(); ++it) { |
| invert_data += (*it)->GetMemoryCopiedData(); |
| invert_bandwidth += (*it)->GetMemoryBandwidth(); |
| } |
| logprintf(4, "Stats: Invert Data: %.2fM at %.2fMB/s\n", |
| invert_data, |
| invert_bandwidth); |
| } |
| |
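| // Print disk IO data and bandwidth from disk and random disk threads. |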
| void Sat::DiskStats() { |
| float disk_data = 0.; |
| float disk_bandwidth = 0.; |
| WorkerMap::const_iterator disk_it = workers_map_.find( |
| static_cast<int>(kDiskType)); |
| WorkerMap::const_iterator random_it = workers_map_.find( |
| static_cast<int>(kRandomDiskType)); |
| sat_assert(disk_it != workers_map_.end()); |
| sat_assert(random_it != workers_map_.end()); |
| for (WorkerVector::const_iterator it = disk_it->second->begin(); |
| it != disk_it->second->end(); ++it) { |
| disk_data += (*it)->GetDeviceCopiedData(); |
| disk_bandwidth += (*it)->GetDeviceBandwidth(); |
| } |
| for (WorkerVector::const_iterator it = random_it->second->begin(); |
| it != random_it->second->end(); ++it) { |
| disk_data += (*it)->GetDeviceCopiedData(); |
| disk_bandwidth += (*it)->GetDeviceBandwidth(); |
| } |
| |
| logprintf(4, "Stats: Disk: %.2fM at %.2fMB/s\n", |
| disk_data, |
| disk_bandwidth); |
| } |
| |
| // Process worker thread data for bandwidth information and error results. |
| // You can add more analysis methods here by subclassing Sat. |
| void Sat::RunAnalysis() { |
| AnalysisAllStats(); |
| MemoryStats(); |
| FileStats(); |
| NetStats(); |
| CheckStats(); |
| InvertStats(); |
| DiskStats(); |
| } |
| |
| // Get total error count, summing across all threads. |
| int64 Sat::GetTotalErrorCount() { |
| int64 errors = 0; |
| |
| AcquireWorkerLock(); |
| for (WorkerMap::const_iterator map_it = workers_map_.begin(); |
| map_it != workers_map_.end(); ++map_it) { |
| for (WorkerVector::const_iterator it = map_it->second->begin(); |
| it != map_it->second->end(); ++it) { |
| errors += (*it)->GetErrorCount(); |
| } |
| } |
| ReleaseWorkerLock(); |
| return errors; |
| } |
| |
| |
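| // Initialize the WorkerStatus objects and spawn all worker threads. |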
| void Sat::SpawnThreads() { |
| logprintf(12, "Log: Initializing WorkerStatus objects\n"); |
| power_spike_status_.Initialize(); |
| continuous_status_.Initialize(); |
| logprintf(12, "Log: Spawning worker threads\n"); |
| for (WorkerMap::const_iterator map_it = workers_map_.begin(); |
| map_it != workers_map_.end(); ++map_it) { |
| for (WorkerVector::const_iterator it = map_it->second->begin(); |
| it != map_it->second->end(); ++it) { |
| logprintf(12, "Log: Spawning thread %d\n", (*it)->ThreadID()); |
| (*it)->SpawnThread(); |
| } |
| } |
| } |
| |
| // Delete used worker thread objects. |
| void Sat::DeleteThreads() { |
| logprintf(12, "Log: Deleting worker threads\n"); |
| for (WorkerMap::const_iterator map_it = workers_map_.begin(); |
| map_it != workers_map_.end(); ++map_it) { |
| for (WorkerVector::const_iterator it = map_it->second->begin(); |
| it != map_it->second->end(); ++it) { |
| logprintf(12, "Log: Deleting thread %d\n", (*it)->ThreadID()); |
| delete (*it); |
| } |
| delete map_it->second; |
| } |
| workers_map_.clear(); |
| logprintf(12, "Log: Destroying WorkerStatus objects\n"); |
| power_spike_status_.Destroy(); |
| continuous_status_.Destroy(); |
| } |
| |
| namespace { |
| // Calculates the next time an action in Sat::Run() should occur, based on a |
| // schedule derived from a start point and a regular frequency. |
| // |
| // Using frequencies instead of intervals with their accompanying drift allows |
| // users to better predict when the actions will occur throughout a run. |
| // |
| // Arguments: |
| // frequency: seconds |
| // start: unixtime |
| // now: unixtime |
| // |
| // Returns: unixtime |
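| // Example: NextOccurance(10, 100, 117) returns 120. |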
| inline time_t NextOccurance(time_t frequency, time_t start, time_t now) { |
| return start + frequency + (((now - start) / frequency) * frequency); |
| } |
| } |
| |
| // Run the actual test. |
| bool Sat::Run() { |
| // Install signal handlers to gracefully exit in the middle of a run. |
| // |
| // Why go through this whole rigmarole? It's the only standards-compliant |
| // (C++ and POSIX) way to handle signals in a multithreaded program. |
| // Specifically: |
| // |
| // 1) (C++) The value of a variable not of type "volatile sig_atomic_t" is |
| // unspecified upon entering a signal handler and, if modified by the |
| // handler, is unspecified after leaving the handler. |
| // |
| // 2) (POSIX) After the value of a variable is changed in one thread, another |
| // thread is only guaranteed to see the new value after both threads have |
| // acquired or released the same mutex or rwlock, synchronized to the |
| // same barrier, or similar. |
| // |
| // #1 prevents the use of #2 in a signal handler, so the signal handler must |
| // be called in the same thread that reads the "volatile sig_atomic_t" |
| // variable it sets. We enforce that by blocking the signals in question in |
| // the worker threads, forcing them to be handled by this thread. |
| logprintf(12, "Log: Installing signal handlers\n"); |
| sigset_t new_blocked_signals; |
| sigemptyset(&new_blocked_signals); |
| sigaddset(&new_blocked_signals, SIGINT); |
| sigaddset(&new_blocked_signals, SIGTERM); |
| sigset_t prev_blocked_signals; |
| pthread_sigmask(SIG_BLOCK, &new_blocked_signals, &prev_blocked_signals); |
| sighandler_t prev_sigint_handler = signal(SIGINT, SatHandleBreak); |
| sighandler_t prev_sigterm_handler = signal(SIGTERM, SatHandleBreak); |
| |
| // Kick off all the worker threads. |
| logprintf(12, "Log: Launching worker threads\n"); |
| InitializeThreads(); |
| SpawnThreads(); |
| pthread_sigmask(SIG_SETMASK, &prev_blocked_signals, NULL); |
| |
| logprintf(12, "Log: Starting countdown with %d seconds\n", runtime_seconds_); |
| |
| // In seconds. |
| static const time_t kSleepFrequency = 5; |
| // All of these are in seconds. You probably want them to be >= |
| // kSleepFrequency and multiples of kSleepFrequency, but neither is necessary. |
| static const time_t kInjectionFrequency = 10; |
| static const time_t kPrintFrequency = 10; |
| |
| const time_t start = time(NULL); |
| const time_t end = start + runtime_seconds_; |
| time_t now = start; |
| time_t next_print = start + kPrintFrequency; |
| time_t next_pause = start + pause_delay_; |
| time_t next_resume = 0; |
| time_t next_injection; |
| if (crazy_error_injection_) { |
| next_injection = start + kInjectionFrequency; |
| } else { |
| next_injection = 0; |
| } |
| |
| while (now < end) { |
| // This is an int because it's for logprintf(). |
| const int seconds_remaining = end - now; |
| |
| if (user_break_) { |
| // Handle early exit. |
| logprintf(0, "Log: User exiting early (%d seconds remaining)\n", |
| seconds_remaining); |
| break; |
| } |
| |
| // If we have an error limit, check it here and see if we should exit. |
| if (max_errorcount_ != 0) { |
| uint64 errors = GetTotalErrorCount(); |
| if (errors > max_errorcount_) { |
| logprintf(0, "Log: Exiting early (%d seconds remaining) " |
| "due to excessive failures (%lld)\n", |
| seconds_remaining, |
| errors); |
| break; |
| } |
| } |
| |
| if (now >= next_print) { |
| // Print a count down message. |
| logprintf(5, "Log: Seconds remaining: %d\n", seconds_remaining); |
| next_print = NextOccurance(kPrintFrequency, start, now); |
| } |
| |
| if (next_injection && now >= next_injection) { |
| // Inject an error. |
| logprintf(4, "Log: Injecting error (%d seconds remaining)\n", |
| seconds_remaining); |
| struct page_entry src; |
| GetValid(&src); |
| src.pattern = patternlist_->GetPattern(0); |
| PutValid(&src); |
| next_injection = NextOccurance(kInjectionFrequency, start, now); |
| } |
| |
| if (next_pause && now >= next_pause) { |
| // Tell worker threads to pause in preparation for a power spike. |
| logprintf(4, "Log: Pausing worker threads in preparation for power spike " |
| "(%d seconds remaining)\n", seconds_remaining); |
| power_spike_status_.PauseWorkers(); |
| logprintf(12, "Log: Worker threads paused\n"); |
| next_pause = 0; |
| next_resume = now + pause_duration_; |
| } |
| |
| if (next_resume && now >= next_resume) { |
| // Tell worker threads to resume in order to cause a power spike. |
| logprintf(4, "Log: Resuming worker threads to cause a power spike (%d " |
| "seconds remaining)\n", seconds_remaining); |
| power_spike_status_.ResumeWorkers(); |
| logprintf(12, "Log: Worker threads resumed\n"); |
| next_pause = NextOccurance(pause_delay_, start, now); |
| next_resume = 0; |
| } |
| |
| sat_sleep(NextOccurance(kSleepFrequency, start, now) - now); |
| now = time(NULL); |
| } |
| |
| JoinThreads(); |
| |
| logprintf(0, "Stats: Found %lld hardware incidents\n", errorcount_); |
| |
| if (!monitor_mode_) |
| RunAnalysis(); |
| |
| DeleteThreads(); |
| |
| logprintf(12, "Log: Uninstalling signal handlers\n"); |
| signal(SIGINT, prev_sigint_handler); |
| signal(SIGTERM, prev_sigterm_handler); |
| |
| return true; |
| } |
| |
| // Clean up all resources. |
| bool Sat::Cleanup() { |
| g_sat = NULL; |
| Logger::GlobalLogger()->StopThread(); |
| Logger::GlobalLogger()->SetStdoutOnly(); |
| if (logfile_) { |
| close(logfile_); |
| logfile_ = 0; |
| } |
| if (patternlist_) { |
| patternlist_->Destroy(); |
| delete patternlist_; |
| patternlist_ = 0; |
| } |
| if (os_) { |
| os_->FreeTestMem(); |
| delete os_; |
| os_ = 0; |
| } |
| if (empty_) { |
| delete empty_; |
| empty_ = 0; |
| } |
| if (valid_) { |
| delete valid_; |
| valid_ = 0; |
| } |
| if (finelock_q_) { |
| delete finelock_q_; |
| finelock_q_ = 0; |
| } |
| if (page_bitmap_) { |
| delete[] page_bitmap_; |
| } |
| |
| for (size_t i = 0; i < blocktables_.size(); i++) { |
| delete blocktables_[i]; |
| } |
| |
| if (cc_cacheline_data_) { |
| // The num integer arrays for all the cacheline structures are |
| // allocated as a single chunk. The pointers in the cacheline struct |
| // are populated accordingly. Hence calling free on the first |
| // cacheline's num's address is going to free the entire array. |
| // TODO(aganti): Refactor this to have a class for the cacheline |
| // structure (currently defined in worker.h) and clean this up |
| // in the destructor of that class. |
| if (cc_cacheline_data_[0].num) { |
| free(cc_cacheline_data_[0].num); |
| } |
| free(cc_cacheline_data_); |
| } |
| |
| sat_assert(0 == pthread_mutex_destroy(&worker_lock_)); |
| |
| return true; |
| } |
| |
| |
| // Pretty print really obvious results. |
| bool Sat::PrintResults() { |
| bool result = true; |
| |
| logprintf(4, "\n"); |
| if (statuscount_) { |
| logprintf(4, "Status: FAIL - test encountered procedural errors\n"); |
| result = false; |
| } else if (errorcount_) { |
| logprintf(4, "Status: FAIL - test discovered HW problems\n"); |
| result = false; |
| } else { |
| logprintf(4, "Status: PASS - please verify no corrected errors\n"); |
| } |
| logprintf(4, "\n"); |
| |
| return result; |
| } |
| |
| // Helper functions. |
| void Sat::AcquireWorkerLock() { |
| sat_assert(0 == pthread_mutex_lock(&worker_lock_)); |
| } |
| void Sat::ReleaseWorkerLock() { |
| sat_assert(0 == pthread_mutex_unlock(&worker_lock_)); |
| } |
| |
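| // Global log printing function; forwards to the global Logger instance. |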
| void logprintf(int priority, const char *format, ...) { |
| va_list args; |
| va_start(args, format); |
| Logger::GlobalLogger()->VLogF(priority, format, args); |
| va_end(args); |
| } |