BibTeX Citations
June 16th, 2013
% Journal article (FGCS, 2013, to appear): scaling studies with the Extreme-scale Simulator (xSim).
% Volume, issue, pages, ISSN, DOI and URL are not yet known and are therefore omitted.
@article{engelmann13scaling,
  author    = {Christian Engelmann},
  title     = {Scaling To A Million Cores And Beyond: {Using} Light-Weight Simulation to Understand The Challenges Ahead On The Road To Exascale},
  journal   = {\href{http://www.elsevier.com/locate/fgcs}{Future Generation Computer Systems (FGCS)}},
  year      = {2013},
  publisher = {\href{http://www.elsevier.com}{Elsevier B.V, Amsterdam, The Netherlands}},
  abstract  = {As supercomputers scale to 1,000 PFlop/s over the next decade, investigating the performance of parallel applications at scale on future architectures and the performance impact of different architecture choices for high-performance computing (HPC) hardware/software co-design is crucial. This paper summarizes recent efforts in designing and implementing a novel HPC hardware/software co-design toolkit. The presented Extreme-scale Simulator (xSim) permits running an HPC application in a controlled environment with millions of concurrent execution threads while observing its performance in a simulated extreme-scale HPC system using architectural models and virtual timing. This paper demonstrates the capabilities and usefulness of the xSim performance investigation toolkit, such as its scalability to $2^{27}$ simulated Message Passing Interface (MPI) ranks on 960 real processor cores, the capability to evaluate the performance of different MPI collective communication algorithms, and the ability to evaluate the performance of a basic Monte Carlo application with different architectural parameters.},
  note      = {To appear}
}
% Journal article (JPDC 72(2), 2012): proactive process-level live migration and back migration.
@article{wang12proactive,
  author    = {Chao Wang and Frank Mueller and Christian Engelmann and Stephen L. Scott},
  title     = {Proactive Process-Level Live Migration and Back Migration in {HPC} Environments},
  journal   = {\href{http://www.elsevier.com/locate/jpdc}{Journal of Parallel and Distributed Computing (JPDC)}},
  volume    = {72},
  number    = {2},
  pages     = {254--267},
  month     = feb,
  year      = {2012},
  publisher = {\href{http://www.elsevier.com}{Elsevier B.V, Amsterdam, The Netherlands}},
  issn      = {0743-7315},
  doi       = {http://dx.doi.org/10.1016/j.jpdc.2011.10.009},
  url       = {http://www.christian-engelmann.info/publications/wang12proactive.pdf},
  abstract  = {As the number of nodes in high-performance computing environments keeps increasing, faults are becoming common place. Reactive fault tolerance (FT) often does not scale due to massive I/O requirements and relies on manual job resubmission. This work complements reactive with proactive FT at the process level. Through health monitoring, a subset of node failures can be anticipated when one's health deteriorates. A novel process-level live migration mechanism supports continued execution of applications during much of process migration. This scheme is integrated into an MPI execution environment to transparently sustain health-inflicted node failures, which eradicates the need to restart and requeue MPI jobs. Experiments indicate that 1-6.5 s of prior warning are required to successfully trigger live process migration while similar operating system virtualization mechanisms require 13-24 s. This self-healing approach complements reactive FT by nearly cutting the number of checkpoints in half when 70\% of the faults are handled proactively. The work also provides a novel back migration approach to eliminate load imbalance or bottlenecks caused by migrated tasks. Experiments indicate the larger the amount of outstanding execution, the higher the benefit due to back migration.},
}
% Journal article (FGCS 26(3), 2010): system-level virtualization research for HPC.
% The accented author name uses the brace-wrapped form {\'e} so BibTeX treats it as one letter.
@article{scott10system,
  author    = {Stephen L. Scott and Geoffroy R. Vall{\'e}e and Thomas Naughton and Anand Tikotekar and Christian Engelmann and Hong H. Ong},
  title     = {System-Level Virtualization Research at {Oak Ridge National Laboratory}},
  journal   = {\href{http://www.elsevier.com/locate/fgcs}{Future Generation Computer Systems (FGCS)}},
  volume    = {26},
  number    = {3},
  pages     = {304--307},
  month     = mar,
  year      = {2010},
  publisher = {\href{http://www.elsevier.com}{Elsevier B.V, Amsterdam, The Netherlands}},
  issn      = {0167-739X},
  doi       = {http://dx.doi.org/10.1016/j.future.2009.07.001},
  url       = {http://www.christian-engelmann.info/publications/scott09system.pdf},
  abstract  = {System-level virtualization is today enjoying a rebirth as a technique to effectively share what were then considered large computing resources to subsequently fade from the spotlight as individual workstations gained in popularity with a one machine -- one user approach. One reason for this resurgence is that the simple workstation has grown in capability to rival that of anything available in the past. Thus, computing centers are again looking at the price/performance benefit of sharing that single computing box via server consolidation. However, industry is only concentrating on the benefits of using virtualization for server consolidation (enterprise computing) whereas our interest is in leveraging virtualization to advance high-performance computing (HPC). While these two interests may appear to be orthogonal, one consolidating multiple applications and users on a single machine while the other requires all the power from many machines to be dedicated solely to its purpose, we propose that virtualization does provide attractive capabilities that may be exploited to the benefit of HPC interests. This does raise the two fundamental questions of: is the concept of virtualization (a machine sharing technology) really suitable for HPC and if so, how does one go about leveraging these virtualization capabilities for the benefit of HPC. To address these questions, this document presents ongoing studies on the usage of system-level virtualization in a HPC context. These studies include an analysis of the benefits of system-level virtualization for HPC, a presentation of research efforts based on virtualization for system availability, and a presentation of research efforts for the management of virtual systems. The basis for this document was material presented by Stephen L. Scott at the Collaborative and Grid Computing Technologies meeting held in Cancun, Mexico on April 12-14, 2007.}
}
% Journal article (JPDC 69(12), 2009): symmetric active/active metadata service for parallel file systems.
@article{he09symmetric,
  author    = {Xubin (Ben) He and Li Ou and Christian Engelmann and Xin Chen and Stephen L. Scott},
  title     = {Symmetric Active/Active Metadata Service for High Availability Parallel File Systems},
  journal   = {\href{http://www.elsevier.com/locate/jpdc}{Journal of Parallel and Distributed Computing (JPDC)}},
  volume    = {69},
  number    = {12},
  pages     = {961--973},
  month     = dec,
  year      = {2009},
  publisher = {\href{http://www.elsevier.com}{Elsevier B.V, Amsterdam, The Netherlands}},
  issn      = {0743-7315},
  doi       = {http://dx.doi.org/10.1016/j.jpdc.2009.08.004},
  url       = {http://www.christian-engelmann.info/publications/he09symmetric.pdf},
  abstract  = {High availability data storage systems are critical for many applications as research and business become more data-driven. Since metadata management is essential to system availability, multiple metadata services are used to improve the availability of distributed storage systems. Past research focused on the active/standby model, where each active service has at least one redundant idle backup. However, interruption of service and even some loss of service state may occur during a fail-over depending on the used replication technique. In addition, the replication overhead for multiple metadata services can be very high. The research in this paper targets the symmetric active/active replication model, which uses multiple redundant service nodes running in virtual synchrony. In this model, service node failures do not cause a fail-over to a backup and there is no disruption of service or loss of service state. We further discuss a fast delivery protocol to reduce the latency of the needed total order broadcast. Our prototype implementation shows that metadata service high availability can be achieved with an acceptable performance trade-off using our symmetric active/active metadata service solution.}
}
% Journal article (IJHPCN 5(1-2), 2007): unified multiple-level cache for cluster storage systems.
@article{he07unified,
  author    = {Xubin (Ben) He and Li Ou and Martha J. Kosa and Stephen L. Scott and Christian Engelmann},
  title     = {A Unified Multiple-Level Cache for High Performance Cluster Storage Systems},
  journal   = {\href{http://www.inderscience.com/browse/index.php?journalcode=ijhpcn} {International Journal of High Performance Computing and Networking (IJHPCN)}},
  volume    = {5},
  number    = {1-2},
  pages     = {97--109},
  year      = {2007},
  publisher = {\href{http://www.inderscience.com}{Inderscience Publishers, Geneve, Switzerland}},
  issn      = {1740-0562},
  doi       = {http://dx.doi.org/10.1504/IJHPCN.2007.015768},
  url       = {http://www.christian-engelmann.info/publications/he07unified.pdf},
  abstract  = {Highly available data storage for high-performance computing is becoming increasingly more critical as high-end computing systems scale up in size and storage systems are developed around network-centered architectures. A promising solution is to harness the collective storage potential of individual workstations much as we harness idle CPU cycles due to the excellent price/performance ratio and low storage usage of most commodity workstations. For such a storage system, metadata consistency is a key issue assuring storage system availability as well as data reliability. In this paper, we present a decentralized metadata management scheme that improves storage availability without sacrificing performance.}
}
% Journal article (JCP 1(8), 2006): symmetric active/active high availability for HPC system services.
@article{engelmann06symmetric,
  author    = {Christian Engelmann and Stephen L. Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He},
  title     = {Symmetric Active/Active High Availability for High-Performance Computing System Services},
  journal   = {\href{http://www.academypublisher.com/jcp}{Journal of Computers (JCP)}},
  volume    = {1},
  number    = {8},
  pages     = {43--54},
  month     = dec,
  year      = {2006},
  publisher = {\href{http://www.academypublisher.com}{Academy Publisher, Oulu, Finland}},
  issn      = {1796-203X},
  doi       = {http://www.academypublisher.com/jcp/vol01/no08/jcp01084354.html},
  url       = {http://www.christian-engelmann.info/publications/engelmann06symmetric.pdf},
  abstract  = {This work aims to pave the way for high availability in high-performance computing (HPC) by focusing on efficient redundancy strategies for head and service nodes. These nodes represent single points of failure and control for an entire HPC system as they render it inaccessible and unmanageable in case of a failure until repair. The presented approach introduces two distinct replication methods, internal and external, for providing symmetric active/active high availability for multiple redundant head and service nodes running in virtual synchrony utilizing an existing process group communication system for service group membership management and reliable, totally ordered message delivery. Presented results of a prototype implementation that offers symmetric active/active replication for HPC job and resource management using external replication show that the highest level of availability can be provided with an acceptable performance trade-off.}
}
% Journal article (ACM SIGOPS OSR 40(2), 2006): overview of the MOLAR research effort.
% Case-protected title words are braced as whole words.
@article{engelmann06molar,
  author    = {Christian Engelmann and Stephen L. Scott and David E. Bernholdt and Narasimha R. Gottumukkala and Chokchai (Box) Leangsuksun and Jyothish Varma and Chao Wang and Frank Mueller and Aniruddha G. Shet and Ponnuswamy (Saday) Sadayappan},
  title     = {{MOLAR}: {Adaptive} Runtime Support for High-End Computing Operating and Runtime Systems},
  journal   = {\href{http://www.sigops.org/osr.html}{ACM SIGOPS Operating Systems Review (OSR)}},
  volume    = {40},
  number    = {2},
  pages     = {63--72},
  month     = apr,
  year      = {2006},
  publisher = {\href{http://www.acm.org}{ACM Press, New York, NY, USA}},
  issn      = {0163-5980},
  doi       = {http://doi.acm.org/10.1145/1131322.1131337},
  url       = {http://www.christian-engelmann.info/publications/engelmann06molar.pdf},
  abstract  = {MOLAR is a multi-institutional research effort that concentrates on adaptive, reliable, and efficient operating and runtime system (OS/R) solutions for ultra-scale, high-end scientific computing on the next generation of supercomputers. This research addresses the challenges outlined in FAST-OS (forum to address scalable technology for runtime and operating systems) and HECRTF (high-end computing revitalization task force) activities by exploring the use of advanced monitoring and adaptation to improve application performance and predictability of system interruptions, and by advancing computer reliability, availability and serviceability (RAS) management systems to work cooperatively with the OS/R to identify and preemptively resolve system issues. This paper describes recent research of the MOLAR team in advancing RAS for high-end computing OS/Rs.}
}
% Conference paper (IASTED PDCN 2013): simulating operating system noise with xSim.
% Page numbers and DOI are not known and are therefore omitted.
@conference{engelmann13investigating,
  author    = {Christian Engelmann},
  title     = {Investigating Operating System Noise in Extreme-Scale High-Performance Computing Systems using Simulation},
  booktitle = {Proceedings of the \href{http://www.iasted.org/conferences/home-795.html} {$11^{th}$ IASTED International Conference on Parallel and Distributed Computing and Networks (PDCN) 2013}},
  month     = feb # "~11-13, ",
  year      = {2013},
  address   = {Innsbruck, Austria},
  publisher = {\href{http://www.actapress.com}{ACTA Press, Calgary, AB, Canada}},
  isbn      = {978-0-88986-943-1},
  url       = {http://www.christian-engelmann.info/publications/engelmann12investigating.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann12investigating.ppt.pdf},
  abstract  = {Hardware/software co-design for future-generation high-performance computing (HPC) systems aims at closing the gap between the peak capabilities of the hardware and the performance realized by applications (application-architecture performance gap). Performance profiling of architectures and applications is a crucial part of this iterative process. The work in this paper focuses on operating system (OS) noise as an additional factor to be considered for co-design. It represents the first step in including OS noise in HPC hardware/software co-design by adding a noise injection feature to an existing simulation-based co-design toolkit. It reuses an existing abstraction for OS noise with frequency (periodic recurrence) and period (duration of each occurrence) to enhance the processor model of the Extreme-scale Simulator (xSim) with synchronized and random OS noise simulation. The results demonstrate this capability by evaluating the impact of OS noise on MPI\_Bcast() and MPI\_Reduce() in a simulated future-generation HPC system with 2,097,152 compute nodes.}
}
% Conference paper (SC 2012): RedMPI, detection and correction of silent data corruption via MPI redundancy.
@conference{fiala12detection2,
  author    = {David Fiala and Frank Mueller and Christian Engelmann and Kurt Ferreira and Ron Brightwell and Rolf Riesen},
  title     = {Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing},
  booktitle = {Proceedings of the \href{http://sc12.supercomputing.org}{$25^{th}$ IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2012}},
  pages     = {78:1--78:12},
  month     = nov # "~10-16, ",
  year      = {2012},
  address   = {Salt Lake City, UT, USA},
  publisher = {\href{http://www.acm.org}{ACM Press, New York, NY, USA}},
  isbn      = {978-1-4673-0804-5},
  url       = {http://www.christian-engelmann.info/publications/fiala12detection2.pdf},
  url2      = {http://www.christian-engelmann.info/publications/fiala12detection2.ppt.pdf},
  abstract  = {Faults have become the norm rather than the exception for high-end computing on clusters with 10s/100s of thousands of cores. Exacerbating this situation, some of these faults remain undetected, manifesting themselves as silent errors that corrupt memory while applications continue to operate and report incorrect results. This paper studies the potential for redundancy to both detect and correct soft errors in MPI message-passing applications. Our study investigates the challenges inherent to detecting soft errors within MPI application while providing transparent MPI redundancy. By assuming a model wherein corruption in application data manifests itself by producing differing MPI message data between replicas, we study the best suited protocols for detecting and correcting MPI data that is the result of corruption. To experimentally validate our proposed detection and correction protocols, we introduce RedMPI, an MPI library which resides in the MPI profiling layer. RedMPI is capable of both online detection and correction of soft errors that occur in MPI applications without requiring any modifications to the application source by utilizing either double or triple redundancy. Our results indicate that our most efficient consistency protocol can successfully protect applications experiencing even high rates of silent data corruption with runtime overheads between 0\% and 30\% as compared to unprotected applications without redundancy. Using our fault injector within RedMPI, we observe that even a single soft error can have profound effects on running applications, causing a cascading pattern of corruption in most cases causes that spreads to all other processes. RedMPI's protection has been shown to successfully mitigate the effects of soft errors while allowing applications to complete with correct results even in the face of errors.},
  note      = {Acceptance rate 21.2\% (100/472)}
}
% Conference paper (ICDCS 2012): combining partial redundancy with checkpoint/restart.
% The exponent in the abstract is set in math mode so the text can be typeset by LaTeX.
@conference{elliott12combining,
  author    = {James Elliott and Kishor Kharbas and David Fiala and Frank Mueller and Kurt Ferreira and Christian Engelmann},
  title     = {Combining Partial Redundancy and Checkpointing for {HPC}},
  booktitle = {Proceedings of the \href{http://icdcs-2012.org/} {$32^{nd}$ International Conference on Distributed Computing Systems (ICDCS) 2012}},
  pages     = {615--626},
  month     = jun # "~18-21, ",
  year      = {2012},
  address   = {Macau, SAR, China},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {978-0-7695-4685-8},
  issn      = {1063-6927},
  doi       = {http://dx.doi.org/10.1109/ICDCS.2012.56},
  url       = {http://www.christian-engelmann.info/publications/elliott12combining.pdf},
  url2      = {http://www.christian-engelmann.info/publications/elliott12combining.ppt.pdf},
  abstract  = {Today's largest High Performance Computing (HPC) systems exceed one Petaflops ($10^{15}$ floating point operations per second) and exascale systems are projected within seven years. But reliability is becoming one of the major challenges faced by exascale computing. With billion-core parallelism, the mean time to failure is projected to be in the range of minutes or hours instead of days. Failures are becoming the norm rather than the exception during execution of HPC applications. Current fault tolerance techniques in HPC focus on reactive ways to mitigate faults, namely via checkpoint and restart (C/R). Apart from storage overheads, C/R-based fault recovery comes at an additional cost in terms of application performance because normal execution is disrupted when checkpoints are taken. Studies have shown that applications running at a large scale spend more than 50\% of their total time saving checkpoints, restarting and redoing lost work. Redundancy is another fault tolerance technique, which employs redundant processes performing the same task. If a process fails, a replica of it can take over its execution. Thus, redundant copies can decrease the overall failure rate. The downside of redundancy is that extra resources are required and there is an additional overhead on communication and synchronization. This work contributes a model and analyzes the benefit of C/R in coordination with redundancy at different degrees to minimize the total wallclock time and resources utilization of HPC applications. We further conduct experiments with an implementation of redundancy within the MPI layer on a cluster. Our experimental results confirm the benefit of dual and triple redundancy - but not for partial redundancy - and show a close fit to the model. At 80,000 processes, dual redundancy requires twice the number of processing resources for an application but allows two jobs of 128 hours wallclock time to finish within the time of just one job without redundancy. For narrow ranges of processor counts, partial redundancy results in the lowest time. Once the count exceeds 770,000, triple redundancy has the lowest overall cost. Thus, redundancy allows one to trade-off additional resource requirements against wallclock time, which provides a tuning knob for users to adapt to resource availabilities.},
  note      = {Acceptance rate 13\% (71/515)}
}
% Conference paper (IPDPS 2012): NVMalloc, exposing aggregate SSD storage as a memory partition.
@conference{wang12nvmalloc,
  author    = {Chao Wang and Sudharshan S. Vazhkudai and Xiaosong Ma and Fei Meng and Youngjae Kim and Christian Engelmann},
  title     = {{NVMalloc}: Exposing an Aggregate {SSD} Store as a Memory Partition in Extreme-Scale Machines},
  booktitle = {Proceedings of the \href{http://www.ipdps.org} {$26^{th}$ IEEE International Parallel and Distributed Processing Symposium (IPDPS) 2012}},
  pages     = {957--968},
  month     = may # "~21-25, ",
  year      = {2012},
  address   = {Shanghai, China},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {978-0-7695-4675-9},
  doi       = {http://dx.doi.org/10.1109/IPDPS.2012.90},
  url       = {http://www.christian-engelmann.info/publications/wang12nvmalloc.pdf},
  url2      = {http://www.christian-engelmann.info/publications/wang12nvmalloc.ppt.pdf},
  abstract  = {DRAM is a precious resource in extreme-scale machines and is increasingly becoming scarce, mainly due to the growing number of cores per node. On future multi-petaflop and exaflop machines, the memory pressure is likely to be so severe that we need to rethink our memory usage models. Fortunately, the advent of non-volatile memory (NVM) offers a unique opportunity in this space. Current NVM offerings possess several desirable properties, such as low cost and power efficiency, but also suffer from high latency and lifetime issues. We need rich techniques to be able to use them alongside DRAM. In this paper, we propose a novel approach to exploiting NVM as a secondary memory partition so that applications can explicitly allocate and manipulate memory regions therein. More specifically, we propose an NVMalloc library with a suite of services that enables applications to access a distributed NVM storage system. We have devised ways within NVMalloc so that the storage system, built from compute node-local NVM devices, can be accessed in a byte-addressable fashion using the memory mapped I/O interface. Our approach has the potential to re-energize out-of-core computations on large-scale machines by having applications allocate certain variables through NVMalloc, thereby increasing the overall memory available for the application. Our evaluation on a 128-core cluster shows that NVMalloc enables applications to compute problem sizes larger than the physical memory in a cost-effective manner. It can achieve better performance with increased computation time between NVM memory accesses or increased data access locality. In addition, our results suggest that while NVMalloc enables transparent access to NVM-resident variables, the explicit control it provides is crucial to optimize application performance.},
  note      = {Acceptance rate 21\% (118/569)}
}
% Conference paper (Euromicro PDP 2012): file I/O coordination for redundantly executed MPI applications.
@conference{boehm12file,
  author    = {Swen B{\"o}hm and Christian Engelmann},
  title     = {File {I/O} for {MPI} Applications in Redundant Execution Scenarios},
  booktitle = {Proceedings of the \href{http://www.pdp2012.org}{$20^{th}$ Euromicro International Conference on Parallel, Distributed, and network-based Processing (PDP) 2012}},
  pages     = {112--119},
  month     = feb # "~15-17, ",
  year      = {2012},
  address   = {Garching, Germany},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {978-0-7695-4633-9},
  issn      = {1066-6192},
  doi       = {http://dx.doi.org/10.1109/PDP.2012.22},
  url       = {http://www.christian-engelmann.info/publications/boehm12file.pdf},
  url2      = {http://www.christian-engelmann.info/publications/boehm12file.ppt.pdf},
  abstract  = {As multi-petascale and exa-scale high-performance computing (HPC) systems inevitably have to deal with a number of resilience challenges, such as a significant growth in component count and smaller circuit sizes with lower circuit voltages, redundancy may offer an acceptable level of resilience that traditional fault tolerance techniques, such as checkpoint/restart, do not. Although redundancy in HPC is quite controversial due to the associated cost for redundant components, the constantly increasing number of cores-per-processor is tilting this cost calculation toward a system design where computation, such as for redundancy, is much cheaper and communication, needed for checkpoint/restart, is much more expensive. Recent research and development activities in redundancy for Message Passing Interface (MPI) applications focused on availability/reliability models and replication algorithms. This paper takes a first step toward solving an open research problem associated with running a parallel application redundantly, which is file I/O under redundancy. The approach intercepts file I/O calls made by a redundant application to employ coordination protocols that execute file I/O operations in a redundancy-oblivious fashion when accessing a node-local file system, or in a redundancy-aware fashion when accessing a shared networked file system. A proof-of-concept prototype is presented and a number of coordination protocols are described and evaluated. The results show the performance impact for redundantly accessing a shared networked file system, but also demonstrate the capability to regain performance by utilizing MPI communication between replicas and parallel file I/O.}
}
% Conference paper (HPCS 2011): enhancements to the Extreme-scale Simulator (xSim) toolkit.
@conference{boehm11xsim,
  author    = {Swen B{\"o}hm and Christian Engelmann},
  title     = {{xSim}: {The} Extreme-Scale Simulator},
  booktitle = {Proceedings of the \href{http://hpcs11.cisedu.info}{International Conference on High Performance Computing and Simulation (HPCS) 2011}},
  pages     = {280--286},
  month     = jul # "~4-8, ",
  year      = {2011},
  address   = {Istanbul, Turkey},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {978-1-61284-383-4},
  doi       = {http://dx.doi.org/10.1109/HPCSim.2011.5999835},
  url       = {http://www.christian-engelmann.info/publications/boehm11xsim.pdf},
  url2      = {http://www.christian-engelmann.info/publications/boehm11xsim.ppt.pdf},
  abstract  = {Investigating parallel application performance properties at scale is becoming an important part of high-performance computing (HPC) application development and deployment. The Extreme-scale Simulator (xSim) is a performance investigation toolkit that permits running an application in a controlled environment at extreme scale without the need for a respective extreme-scale HPC system. Using a lightweight parallel discrete event simulation, xSim executes a parallel application with a virtual wall clock time, such that performance data can be extracted based on a processor model and a network model. This paper presents significant enhancements to the xSim toolkit prototype that provide a more complete Message Passing Interface (MPI) support and improve its versatility. These enhancements include full virtual MPI group, communicator and collective communication support, and global variables support. The new capabilities are demonstrated by executing the entire NAS Parallel Benchmark suite in a simulated HPC environment.},
  note      = {Acceptance rate 28.1\% (48/171)}
}
% Conference paper (IASTED PDCN 2011): MR-MPI, modular-redundant execution of MPI applications.
% The complexity expression in the abstract is set in math mode so the text can be typeset by LaTeX.
@conference{engelmann11redundant,
  author    = {Christian Engelmann and Swen B{\"o}hm},
  title     = {Redundant Execution of {HPC} Applications with {MR-MPI}},
  booktitle = {Proceedings of the \href{http://www.iasted.org/conferences/home-719.html} {$10^{th}$ IASTED International Conference on Parallel and Distributed Computing and Networks (PDCN) 2011}},
  pages     = {31--38},
  month     = feb # "~15-17, ",
  year      = {2011},
  address   = {Innsbruck, Austria},
  publisher = {\href{http://www.actapress.com}{ACTA Press, Calgary, AB, Canada}},
  isbn      = {978-0-88986-864-9},
  doi       = {http://dx.doi.org/10.2316/P.2011.719-031},
  url       = {http://www.christian-engelmann.info/publications/engelmann11redundant.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann11redundant.ppt.pdf},
  abstract  = {This paper presents a modular-redundant Message Passing Interface (MPI) solution, MR-MPI, for transparently executing high-performance computing (HPC) applications in a redundant fashion. The presented work addresses the deficiencies of recovery-oriented HPC, i.e., checkpoint/restart to/from a parallel file system, at extreme scale by adding the redundancy approach to the HPC resilience portfolio. It utilizes the MPI performance tool interface, PMPI, to transparently intercept MPI calls from an application and to hide all redundancy-related mechanisms. A redundantly executed application runs with $r*m$ native MPI processes, where $r$ is the number of MPI ranks visible to the application and $m$ is the replication degree. Messages between redundant nodes are replicated. Partial replication for tunable resilience is supported. The performance results clearly show the negative impact of the $O(m^2)$ messages between replicas. For low-level, point-to-point benchmarks, the impact can be as high as the replication degree. For applications, performance highly depends on the actual communication types and counts. On single-core systems, the overhead can be 0\% for embarrassingly parallel applications independent of the employed redundancy configuration or up to 70-90\% for communication-intensive applications in a dual-redundant configuration. On multi-core systems, the overhead can be significantly higher due to the additional communication contention.}
}
% Conference paper (ICPADS 2010): hybrid full/incremental checkpointing for MPI jobs.
@conference{wang10hybrid2,
  author    = {Chao Wang and Frank Mueller and Christian Engelmann and Stephen L. Scott},
  title     = {Hybrid Checkpointing for {MPI} Jobs in {HPC} Environments},
  booktitle = {Proceedings of the \href{http://grid.sjtu.edu.cn/icpads10}{$16^{th}$ IEEE International Conference on Parallel and Distributed Systems (ICPADS) 2010}},
  pages     = {524--533},
  month     = dec # "~8-10, ",
  year      = {2010},
  address   = {Shanghai, China},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {978-0-7695-4307-9},
  doi       = {http://dx.doi.org/10.1109/ICPADS.2010.48},
  url       = {http://www.christian-engelmann.info/publications/wang10hybrid2.pdf},
  url2      = {http://www.christian-engelmann.info/publications/wang10hybrid2.ppt.pdf},
  abstract  = {As the core count in high-performance computing systems keeps increasing, faults are becoming common place. Check pointing addresses such faults but captures full process images even though only a subset of the process image changes between checkpoints. We have designed a hybrid check pointing technique for MPI tasks of high-performance applications. This technique alternates between full and incremental checkpoints: At incremental checkpoints, only data changed since the last checkpoint is captured. Our implementation integrates new BLCR and LAM/MPI features that complement traditional full checkpoints. This results in significantly reduced checkpoint sizes and overheads with only moderate increases in restart overhead. After accounting for cost and savings, benefits due to incremental checkpoints are an order of magnitude larger than overheads on restarts. We further derive qualitative results indicating an optimal balance between full/incremental checkpoints of our novel approach at a ratio of 1:9, which outperforms both always-full and always-incremental check pointing.},
  note      = {Acceptance rate 29.6\% (77/188)}
}
% Conference paper (SC 2010): functional partitioning of cores for end-to-end application performance.
@conference{li10functional,
  author    = {Min Li and Sudharshan S. Vazhkudai and Ali R. Butt and Fei Meng and Xiaosong Ma and Youngjae Kim and Christian Engelmann and Galen Shipman},
  title     = {Functional Partitioning to Optimize End-to-End Performance on Many-Core Architectures},
  booktitle = {Proceedings of the \href{http://sc10.supercomputing.org}{$23^{rd}$ IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2010}},
  pages     = {1--12},
  month     = nov # "~13-19, ",
  year      = {2010},
  address   = {New Orleans, LA, USA},
  publisher = {\href{http://www.acm.org}{ACM Press, New York, NY, USA}},
  isbn      = {978-1-4244-7559-9},
  doi       = {http://dx.doi.org/10.1109/SC.2010.28},
  url       = {http://www.christian-engelmann.info/publications/li10functional.pdf},
  url2      = {http://www.christian-engelmann.info/publications/li10functional.ppt.pdf},
  abstract  = {Scaling computations on emerging massive-core supercomputers is a daunting task, which coupled with the significantly lagging system I/O capabilities exacerbates applications' end-to-end performance. The I/O bottleneck often negates potential performance benefits of assigning additional compute cores to an application. In this paper, we address this issue via a novel functional partitioning (FP) runtime environment that allocates cores to specific application tasks - checkpointing, de-duplication, and scientific data format transformation - so that the deluge of cores can be brought to bear on the entire gamut of application activities. The focus is on utilizing the extra cores to support HPC application I/O activities and also leverage solid-state disks in this context. For example, our evaluation shows that dedicating 1 core on an oct-core machine for checkpointing and its assist tasks using FP can improve overall execution time of a FLASH benchmark on 80 and 160 cores by 43.95\% and 41.34\%, respectively.},
  note      = {Acceptance rate 19.8\% (50/253)}
}
% HPCC 2010: aggregation of real-time system monitoring data over a tree-based overlay network.
@conference{boehm10aggregation,
  author    = "Swen B{\"o}hm and Christian Engelmann and Stephen L. Scott",
  title     = "Aggregation of Real-Time System Monitoring Data for Analyzing Large-Scale Parallel and Distributed Computing Environments",
  booktitle = "Proceedings of the \href{http://www.anss.org.au/hpcc2010} {$12^{th}$ IEEE International Conference on High Performance Computing and Communications (HPCC) 2010}",
  pages     = "72--78",
  month     = sep # "~1-3, ",
  year      = "2010",
  address   = "Melbourne, Australia",
  publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}",
  isbn      = "978-0-7695-4214-0",
  doi       = "http://doi.ieeecomputersociety.org/10.1109/HPCC.2010.32",
  url       = "http://www.christian-engelmann.info/publications/boehm10aggregation.pdf",
  url2      = "http://www.christian-engelmann.info/publications/boehm10aggregation.ppt.pdf",
  abstract  = "We present a monitoring system for large-scale parallel and distributed computing environments that allows to trade-off accuracy in a tunable fashion to gain scalability without compromising fidelity. The approach relies on classifying each gathered monitoring metric based on individual needs and on aggregating messages containing classes of individual monitoring metrics using a tree-based overlay network. The MRNet-based prototype is able to significantly reduce the amount of gathered and stored monitoring data, e.g., by a factor of $\sim$56 in comparison to the Ganglia distributed monitoring system. A simple scaling study reveals, however, that further efforts are needed in reducing the amount of data to monitor future-generation extreme-scale systems with up to 1,000,000 nodes. The implemented solution did not have a measurable performance impact as the 32-node test system did not produce enough monitoring data to interfere with running applications.",
  note      = "Acceptance rate 19.1\% (58/304)"
}
% PDCN 2010: proactive fault tolerance framework for HPC (page numbers not recorded).
@conference{litvinova10proactive,
  author    = "Antonina Litvinova and Christian Engelmann and Stephen L. Scott",
  title     = "A Proactive Fault Tolerance Framework for High-Performance Computing",
  booktitle = "Proceedings of the \href{http://www.iasted.org/conferences/home-676.html} {$9^{th}$ IASTED International Conference on Parallel and Distributed Computing and Networks (PDCN) 2010}",
  pages     = "",
  month     = feb # "~16-18, ",
  year      = "2010",
  address   = "Innsbruck, Austria",
  publisher = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB, Canada}",
  isbn      = "978-0-88986-783-3",
  doi       = "http://www.actapress.com/Abstract.aspx?paperId=37915",
  url       = "http://www.christian-engelmann.info/publications/litvinova10proactive.pdf",
  url2      = "http://www.christian-engelmann.info/publications/litvinova10proactive.ppt.pdf",
  abstract  = "As high-performance computing (HPC) systems continue to increase in scale, their mean-time to interrupt decreases respectively. The current state of practice for fault tolerance (FT) is checkpoint/restart. However, with increasing error rates, increasing aggregate memory and not proportionally increasing I/O capabilities, it is becoming less efficient. Proactive FT avoids experiencing failures through preventative measures, such as by migrating application parts away from nodes that are about to fail. This paper presents a proactive FT framework that performs environmental monitoring, event logging, parallel job monitoring and resource monitoring to analyze HPC system reliability and to perform FT through such preventative actions."
}
% ARES 2009: Blue Gene/L log analysis and time-to-interrupt estimation (url2 is empty).
@conference{taerat09blue,
  author    = "Narate Taerat and Nichamon Naksinehaboon and Clayton Chandler and James Elliott and Chokchai (Box) Leangsuksun and George Ostrouchov and Stephen L. Scott and Christian Engelmann",
  title     = "{Blue Gene/L} Log Analysis and Time to Interrupt Estimation",
  booktitle = "Proceedings of the \href{http://www.ares-conference.eu/ares2009}{$4^{th}$ International Conference on Availability, Reliability and Security (ARES) 2009}",
  pages     = "173--180",
  month     = mar # "~16-19, ",
  year      = "2009",
  address   = "Fukuoka, Japan",
  publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}",
  isbn      = "978-1-4244-3572-2",
  doi       = "http://doi.ieeecomputersociety.org/10.1109/ARES.2009.105",
  url       = "http://www.christian-engelmann.info/publications/taerat09blue.pdf",
  url2      = "",
  abstract  = "System- and application-level failures could be characterized by analyzing relevant log files. The resulting data might then be used in numerous studies on and future developments for the mission-critical and large scale computational architecture, including fields such as failure prediction, reliability modeling, performance modeling and power awareness. In this paper, system logs covering a six month period of the Blue Gene/L supercomputer were obtained and subsequently analyzed. Temporal filtering was applied to remove duplicated log messages. Optimistic and pessimistic perspectives were exerted on filtered log information to observe failure behavior within the system. Further, various time to repair factors were applied to obtain application time to interrupt, which will be exploited in further resilience modeling research.",
  note      = "Acceptance rate 25\% (40/160)"
}
% LCI 2009: evaluation of shared root file systems (NFSv4, Lustre, PVFS2) for diskless HPC.
@conference{engelmann09evaluating,
  author    = "Christian Engelmann and Hong H. Ong and Stephen L. Scott",
  title     = "Evaluating the Shared Root File System Approach for Diskless High-Performance Computing Systems",
  booktitle = "Proceedings of the \href{http://www.linuxclustersinstitute.org/conferences} {$10^{th}$ LCI International Conference on High-Performance Clustered Computing (LCI) 2009}",
  month     = mar # "~9-12, ",
  year      = "2009",
  address   = "Boulder, CO, USA",
  url       = "http://www.christian-engelmann.info/publications/engelmann09evaluating.pdf",
  url2      = "http://www.christian-engelmann.info/publications/engelmann09evaluating.ppt.pdf",
  abstract  = "Diskless high-performance computing (HPC) systems utilizing networked storage have become popular in the last several years. Removing disk drives significantly increases compute node reliability as they are known to be a major source of failures. Furthermore, networked storage solutions utilizing parallel I/O and replication are able to provide increased scalability and availability. Reducing a compute node to processor(s), memory and network interface(s) greatly reduces its physical size, which in turn allows for large-scale dense HPC solutions. However, one major obstacle is the requirement by certain operating systems (OSs), such as Linux, for a root file system. While one solution is to remove this requirement from the OS, another is to share the root file system over the networked storage. This paper evaluates three networked file system solutions, NFSv4, Lustre and PVFS2, with respect to their performance, scalability, and availability features for servicing a common root file system in a diskless HPC configuration. Our findings indicate that Lustre is a viable solution as it meets both, scaling and performance requirements. However, certain availability issues regarding single points of failure and control need to be considered."
}
% PDP 2009: architecture and classification for proactive fault tolerance via preemptive migration.
@conference{engelmann09proactive,
  author    = "Christian Engelmann and Geoffroy R. Vall{\'e}e and Thomas Naughton and Stephen L. Scott",
  title     = "Proactive Fault Tolerance Using Preemptive Migration",
  booktitle = "Proceedings of the \href{http://www.pdp2009.org}{$17^{th}$ Euromicro International Conference on Parallel, Distributed, and network-based Processing (PDP) 2009}",
  pages     = "252--257",
  month     = feb # "~18-20, ",
  year      = "2009",
  address   = "Weimar, Germany",
  publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}",
  isbn      = "978-0-7695-3544-9",
  issn      = "1066-6192",
  doi       = "http://doi.ieeecomputersociety.org/10.1109/PDP.2009.31",
  url       = "http://www.christian-engelmann.info/publications/engelmann09proactive.pdf",
  url2      = "http://www.christian-engelmann.info/publications/engelmann09proactive.ppt.pdf",
  abstract  = "Proactive fault tolerance (FT) in high-performance computing is a concept that prevents compute node failures from impacting running parallel applications by preemptively migrating application parts away from nodes that are about to fail. This paper provides a foundation for proactive FT by defining its architecture and classifying implementation options. This paper further relates prior work to the presented architecture and classification, and discusses the challenges ahead for needed supporting technologies.",
  note      = "Acceptance rate 42\%"
}
% PDP 2009: integrating the Harness middleware with InfiniBand (IPoIB and SDP).
@conference{valentini09high,
  author    = "Alessandro Valentini and Christian Di Biagio and Fabrizio Batino and Guido Pennella and Fabrizio Palma and Christian Engelmann",
  title     = "High Performance Computing with {Harness} over {InfiniBand}",
  booktitle = "Proceedings of the \href{http://www.pdp2009.org}{$17^{th}$ Euromicro International Conference on Parallel, Distributed, and network-based Processing (PDP) 2009}",
  pages     = "151--154",
  month     = feb # "~18-20, ",
  year      = "2009",
  address   = "Weimar, Germany",
  publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}",
  isbn      = "978-0-7695-3544-9",
  issn      = "1066-6192",
  doi       = "http://doi.ieeecomputersociety.org/10.1109/PDP.2009.64",
  url       = "http://www.christian-engelmann.info/publications/valentini09high.pdf",
  abstract  = "Harness is an adaptable and plug-in-based middleware framework able to support distributed parallel computing. By now, it is based on the Ethernet protocol which cannot guarantee high performance throughput and Real Time (determinism) performance. During last years, both the research and industry environments have developed both new network architectures (InfiniBand, Myrinet, iWARP, etc.) to avoid those limits. This paper concerns the integration between Harness and InfiniBand focusing on two solutions: IP over InfiniBand (IPoIB) and Socket Direct Protocol (SDP) technology. Those allow Harness middleware to take advantage of the enhanced features provided by InfiniBand.",
  note      = "Acceptance rate 42\%"
}
% PDCN 2009: the case for modular redundancy in large-scale HPC systems.
@conference{engelmann09case,
  author    = "Christian Engelmann and Hong H. Ong and Stephen L. Scott",
  title     = "The Case for Modular Redundancy in Large-Scale High Performance Computing Systems",
  booktitle = "Proceedings of the \href{http://www.iasted.org/conferences/home-641.html} {$8^{th}$ IASTED International Conference on Parallel and Distributed Computing and Networks (PDCN) 2009}",
  pages     = "189--194",
  month     = feb # "~16-18, ",
  year      = "2009",
  address   = "Innsbruck, Austria",
  publisher = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB, Canada}",
  isbn      = "978-0-88986-784-0",
  doi       = "http://www.actapress.com/Abstract.aspx?paperId=34612",
  url       = "http://www.christian-engelmann.info/publications/engelmann09case.pdf",
  url2      = "http://www.christian-engelmann.info/publications/engelmann09case.ppt.pdf",
  abstract  = "Recent investigations into resilience of large-scale high-performance computing (HPC) systems showed a continuous trend of decreasing reliability and availability. Newly installed systems have a lower mean-time to failure (MTTF) and a higher mean-time to recover (MTTR) than their predecessors. Modular redundancy is being used in many mission critical systems today to provide for resilience, such as for aerospace and command \& control systems. The primary argument against modular redundancy for resilience in HPC has always been that the capability of a HPC system, and respective return on investment, would be significantly reduced. We argue that modular redundancy can significantly increase compute node availability as it removes the impact of scale from single compute node MTTR. We further argue that single compute nodes can be much less reliable, and therefore less expensive, and still be highly available, if their MTTR/MTTF ratio is maintained."
}
% SC 2008: proactive process-level live migration in an MPI environment.
@conference{wang08proactive,
  author    = "Chao Wang and Frank Mueller and Christian Engelmann and Stephen L. Scott",
  title     = "Proactive Process-Level Live Migration in {HPC} Environments",
  booktitle = "Proceedings of the \href{http://sc08.supercomputing.org} {$21^{st}$ IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2008}",
  pages     = "1--12",
  month     = nov # "~15-21, ",
  year      = "2008",
  address   = "Austin, TX, USA",
  publisher = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn      = "978-1-4244-2835-9",
  doi       = "http://doi.acm.org/10.1145/1413370.1413414",
  url       = "http://www.christian-engelmann.info/publications/wang08proactive.pdf",
  url2      = "http://www.christian-engelmann.info/publications/wang08proactive.ppt.pdf",
  abstract  = "As the number of nodes in high-performance computing environments keeps increasing, faults are becoming common place. Reactive fault tolerance (FT) often does not scale due to massive I/O requirements and relies on manual job resubmission. This work complements reactive with proactive FT at the process level. Through health monitoring, a subset of node failures can be anticipated when one's health deteriorates. A novel process-level live migration mechanism supports continued execution of applications during much of processes migration. This scheme is integrated into an MPI execution environment to transparently sustain health-inflicted node failures, which eradicates the need to restart and requeue MPI jobs. Experiments indicate that 1-6.5 seconds of prior warning are required to successfully trigger live process migration while similar operating system virtualization mechanisms require 13-24 seconds. This self-healing approach complements reactive FT by nearly cutting the number of checkpoints in half when 70\% of the faults are handled proactively.",
  note      = "Acceptance rate 21.3\% (59/277)"
}
% ARES 2008: extending symmetric active/active replication to dependent services.
@conference{engelmann08symmetric,
  author    = "Christian Engelmann and Stephen L. Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He",
  title     = "Symmetric Active/Active Replication for Dependent Services",
  booktitle = "Proceedings of the \href{http://www.ares-conference.eu/ares2008}{$3^{rd}$ International Conference on Availability, Reliability and Security (ARES) 2008}",
  pages     = "260--267",
  month     = mar # "~4-7, ",
  year      = "2008",
  address   = "Barcelona, Spain",
  publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}",
  isbn      = "978-0-7695-3102-1",
  doi       = "http://doi.ieeecomputersociety.org/10.1109/ARES.2008.64",
  url       = "http://www.christian-engelmann.info/publications/engelmann08symmetric.pdf",
  url2      = "http://www.christian-engelmann.info/publications/engelmann08symmetric.ppt.pdf",
  abstract  = "During the last several years, we have established the symmetric active/active replication model for service-level high availability and implemented several proof-of-concept prototypes. One major deficiency of our model is its inability to deal with dependent services, since its original architecture is based on the client-service model. This paper extends our model to dependent services using its already existing mechanisms and features. The presented concept is based on the idea that a service may also be a client of another service, and multiple services may be clients of each other. A high-level abstraction is used to illustrate dependencies between clients and services, and to decompose dependencies between services into respective client-service dependencies. This abstraction may be used for providing high availability in distributed computing systems with complex service-oriented architectures.",
  note      = "Acceptance rate 21.1\% (40/190)"
}
% ARES 2008: modular framework for proactive fault tolerance policies.
@conference{vallee08framework,
  author    = "Geoffroy R. Vall{\'e}e and Kulathep Charoenpornwattana and Christian Engelmann and Anand Tikotekar and Chokchai (Box) Leangsuksun and Thomas Naughton and Stephen L. Scott",
  title     = "A Framework For Proactive Fault Tolerance",
  booktitle = "Proceedings of the \href{http://www.ares-conference.eu/ares2008}{$3^{rd}$ International Conference on Availability, Reliability and Security (ARES) 2008}",
  pages     = "659--664",
  month     = mar # "~4-7, ",
  year      = "2008",
  address   = "Barcelona, Spain",
  publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}",
  isbn      = "978-0-7695-3102-1",
  doi       = "http://doi.ieeecomputersociety.org/10.1109/ARES.2008.171",
  url       = "http://www.christian-engelmann.info/publications/vallee08framework.pdf",
  url2      = "http://www.christian-engelmann.info/publications/vallee08framework.ppt.pdf",
  abstract  = "Fault tolerance is a major concern to guarantee availability of critical services as well as application execution. Traditional approaches for fault tolerance include checkpoint/restart or duplication. However it is also possible to anticipate failures and proactively take action before failures occur in order to minimize failure impact on the system and application execution. This document presents a proactive fault tolerance framework. This framework can use different proactive fault tolerance mechanisms, i.e. migration and pause/unpause. The framework also allows the implementation of new proactive fault tolerance policies thanks to a modular architecture. A first proactive fault tolerance policy has been implemented and preliminary experimentations have been done based on system-level virtualization and compared with results obtained by simulation.",
  note      = "Acceptance rate 21.1\% (40/190)"
}
% PDP 2008: chroot-based virtualized environments for the Harness HPC Workbench.
@conference{koenning08virtualized,
  author    = "Bj{\"o}rn K{\"o}nning and Christian Engelmann and Stephen L. Scott and George A. (Al) Geist",
  title     = "Virtualized Environments for the {Harness} High Performance Computing Workbench",
  booktitle = "Proceedings of the \href{http://www.pdp2008.org}{$16^{th}$ Euromicro International Conference on Parallel, Distributed, and network-based Processing (PDP) 2008}",
  pages     = "133--140",
  month     = feb # "~13-15, ",
  year      = "2008",
  address   = "Toulouse, France",
  publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}",
  isbn      = "978-0-7695-3089-5",
  doi       = "http://doi.ieeecomputersociety.org/10.1109/PDP.2008.14",
  url       = "http://www.christian-engelmann.info/publications/koenning08virtualized.pdf",
  url2      = "http://www.christian-engelmann.info/publications/koenning08virtualized.ppt.pdf",
  abstract  = "This paper describes recent accomplishments in providing a virtualized environment concept and prototype for scientific application development and deployment as part of the Harness High Performance Computing (HPC) Workbench research effort. The presented work focuses on tools and mechanisms that simplify scientific application development and deployment tasks, such that only minimal adaptation is needed when moving from one HPC system to another or after HPC system upgrades. The overall technical approach focuses on the concept of adapting the HPC system environment to the actual needs of individual scientific applications instead of the traditional scheme of adapting scientific applications to individual HPC system environment properties. The presented prototype implementation is based on the mature and lightweight chroot virtualization approach for Unix-type systems with a focus on virtualized file system structure and virtualized shell environment variables utilizing virtualized environment configuration descriptions in Extensible Markup Language (XML) format. 
The presented work can be easily extended to other virtualization technologies, such as system-level virtualization solutions using hypervisors.",
  note      = "Acceptance rate 40\%"
}
% PDP 2008: challenges and survey of system-level virtualization for HPC.
@conference{vallee08system,
  author    = "Geoffroy R. Vall{\'e}e and Thomas Naughton and Christian Engelmann and Hong H. Ong and Stephen L. Scott",
  title     = "System-level Virtualization for High Performance Computing",
  booktitle = "Proceedings of the \href{http://www.pdp2008.org}{$16^{th}$ Euromicro International Conference on Parallel, Distributed, and network-based Processing (PDP) 2008}",
  pages     = "636--643",
  month     = feb # "~13-15, ",
  year      = "2008",
  address   = "Toulouse, France",
  publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}",
  isbn      = "978-0-7695-3089-5",
  doi       = "http://doi.ieeecomputersociety.org/10.1109/PDP.2008.85",
  url       = "http://www.christian-engelmann.info/publications/vallee08system.pdf",
  url2      = "http://www.christian-engelmann.info/publications/vallee08system.ppt.pdf",
  abstract  = "System-level virtualization has been a research topic since the 70's but regained popularity during the past few years because of the availability of efficient solution such as Xen and the implementation of hardware support in commodity processors (e.g. Intel-VT, AMD-V). However, a majority of system-level virtualization projects is guided by the server consolidation market. As a result, current virtualization solutions appear to not be suitable for high performance computing (HPC) which is typically based on large-scale systems. On another hand there is significant interest in exploiting virtual machines (VMs) within HPC for a number of other reasons. By virtualizing the machine, one is able to run a variety of operating systems and environments as needed by the applications. Virtualization allows users to isolate workloads, improving security and reliability. It is also possible to support non-native environments and/or legacy operating environments through virtualization. In addition, it is possible to balance work loads, use migration techniques to relocate applications from failing machines, and isolate fault systems for repair. 
This document presents the challenges for the implementation of a system-level virtualization solution for HPC. It also presents a brief survey of the different approaches and techniques to address these challenges.",
  note      = "Acceptance rate 40\%"
}
% PDCS 2007: symmetric active/active metadata service for cluster storage (page numbers not recorded).
@conference{ou07symmetric,
  author    = "Li Ou and Christian Engelmann and Xubin (Ben) He and Xin Chen and Stephen L. Scott",
  title     = "Symmetric Active/Active Metadata Service for Highly Available Cluster Storage Systems",
  booktitle = "Proceedings of the \href{http://www.iasted.org/conferences/home-590.html} {$19^{th}$ IASTED International Conference on Parallel and Distributed Computing and Systems (PDCS) 2007}",
  pages     = "",
  month     = nov # "~19-21, ",
  year      = "2007",
  address   = "Cambridge, MA, USA",
  publisher = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB, Canada}",
  isbn      = "978-0-88986-703-1",
  doi       = "http://www.actapress.com/Abstract.aspx?paperId=32008",
  url       = "http://www.christian-engelmann.info/publications/ou07symmetric.pdf",
  url2      = "http://www.christian-engelmann.info/publications/ou07symmetric.ppt.pdf",
  abstract  = "In a typical distributed storage system, metadata is stored and managed by dedicated metadata servers. One way to improve the availability of distributed storage systems is to deploy multiple metadata servers. Past research focused on the active/standby model, where each active server has at least one redundant idle backup. However, interruption of service and loss of service state may occur during a fail-over depending on the used replication technique. The research in this paper targets the symmetric active/active replication model using multiple redundant service nodes running in virtual synchrony. In this model, service node failures do not cause a fail-over to a backup and there is no disruption of service or loss of service state. We propose a fast delivery protocol to reduce the latency of total order broadcast. Our prototype implementation shows that high availability of metadata servers can be achieved with an acceptable performance trade-off using the active/active metadata server solution.",
  note      = "Acceptance rate 49\%"
}
% EuroPVM/MPI 2007 (LNCS volume 4757): real-time distributed computing with Harness.
@conference{disaverio07distributed,
  author    = "Emanuele Di Saverio and Marco Cesati and Christian Di Biagio and Guido Pennella and Christian Engelmann",
  title     = "Distributed Real-Time Computing with {Harness}",
  booktitle = "Lecture Notes in Computer Science: Proceedings of the \href{http://pvmmpi07.lri.fr}{$14^{th}$ European PVM/MPI Users' Group Meeting (EuroPVM/MPI) 2007}",
  pages     = "281--288",
  volume    = "4757",
  month     = sep # "~30 - " # oct # "~3, ",
  year      = "2007",
  address   = "Paris, France",
  publisher = "\href{http://www.springer.com}{Springer Verlag, Berlin, Germany}",
  isbn      = "978-3-540-75415-2",
  issn      = "0302-9743",
  doi       = "http://dx.doi.org/10.1007/978-3-540-75416-9_39",
  url       = "http://www.christian-engelmann.info/publications/disaverio07distributed.pdf",
  url2      = "http://www.christian-engelmann.info/publications/disaverio07distributed.ppt.pdf",
  abstract  = "Modern parallel and distributed computing solutions are often built onto a middleware software layer providing a higher and common level of service between computational nodes. Harness is an adaptable, plugin-based middleware framework for parallel and distributed computing. This paper reports recent research and development results of using Harness for real-time distributed computing applications in the context of an industrial environment with the needs to perform several safety critical tasks. The presented work exploits the modular architecture of Harness in conjunction with a lightweight threaded implementation to resolve several real-time issues by adding three new Harness plug-ins to provide a prioritized lightweight execution environment, low latency communication facilities, and local timestamped event logging."
}
% ICCCN 2007: fast delivery protocol for total order broadcasting.
@conference{ou07fast,
  author    = "Li Ou and Xubin (Ben) He and Christian Engelmann and Stephen L. Scott",
  title     = "A Fast Delivery Protocol for Total Order Broadcasting",
  booktitle = "Proceedings of the \href{http://www.icccn.org/icccn07} {$16^{th}$ IEEE International Conference on Computer Communications and Networks (ICCCN) 2007}",
  pages     = "730--734",
  month     = aug # "~13-16, ",
  year      = "2007",
  address   = "Honolulu, HI, USA",
  publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}",
  isbn      = "978-1-4244-1251-8",
  issn      = "1095-2055",
  doi       = "http://doi.ieeecomputersociety.org/10.1109/ICCCN.2007.4317904",
  url       = "http://www.christian-engelmann.info/publications/ou07fast.pdf",
  url2      = "http://www.christian-engelmann.info/publications/ou07fast.ppt.pdf",
  abstract  = "Sequencer, privilege-based, and communication history algorithms are popular approaches to implement total ordering, where communication history algorithms are most suitable for parallel computing systems, because they provide best performance under heavy work load. Unfortunately, post-transmission delay of communication history algorithms is most apparent when a system is idle. In this paper, we propose a fast delivery protocol to reduce the latency of message ordering. The protocol optimizes the total ordering process by waiting for messages only from a subset of the machines in the group, and by fast acknowledging messages on behalf of other machines. Our test results indicate that the fast delivery protocol is suitable for both idle and heavy load systems, while reducing the latency of message ordering.",
  note      = "Acceptance rate 29.1\% (160/550)"
}
% ICS 2007: proactive fault tolerance for MPI applications using Xen live migration.
@conference{nagarajan07proactive,
  author    = "Arun B. Nagarajan and Frank Mueller and Christian Engelmann and Stephen L. Scott",
  title     = "Proactive Fault Tolerance for {HPC} with {Xen} Virtualization",
  booktitle = "Proceedings of the \href{http://ics07.ac.upc.edu}{$21^{st}$ ACM International Conference on Supercomputing (ICS) 2007}",
  pages     = "23--32",
  month     = jun # "~16-20, ",
  year      = "2007",
  address   = "Seattle, WA, USA",
  publisher = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn      = "978-1-59593-768-1",
  doi       = "http://doi.acm.org/10.1145/1274971.1274978",
  url       = "http://www.christian-engelmann.info/publications/nagarajan07proactive.pdf",
  url2      = "http://www.christian-engelmann.info/publications/nagarajan07proactive.ppt.pdf",
  abstract  = "Large-scale parallel computing is relying increasingly on clusters with thousands of processors. At such large counts of compute nodes, faults are becoming common place. Current techniques to tolerate faults focus on reactive schemes to recover from faults and generally rely on a checkpoint/restart mechanism. Yet, in today's systems, node failures can often be anticipated by detecting a deteriorating health status. Instead of a reactive scheme for fault tolerance (FT), we are promoting a proactive one where processes automatically migrate from unhealthy nodes to healthy ones. Our approach relies on operating system virtualization techniques exemplified by but not limited to Xen. This paper contributes an automatic and transparent mechanism for proactive FT for arbitrary MPI applications. It leverages virtualization techniques combined with health monitoring and load-based migration. We exploit Xen's live migration mechanism for a guest operating system (OS) to migrate an MPI task from a health-deteriorating node to a healthy one without stopping the MPI task during most of the migration. Our proactive FT daemon orchestrates the tasks of health monitoring, load determination and initiation of guest OS migration. 
Experimental results demonstrate that live migration hides migration costs and limits the overhead to only a few seconds making it an attractive approach to realize FT in HPC systems. Overall, our enhancements make proactive FT a valuable asset for long-running MPI application that is complementary to reactive FT using full checkpoint/restart schemes since checkpoint frequencies can be reduced as fewer unanticipated failures are encountered. In the context of OS virtualization, we believe that this is the first comprehensive study of proactive fault tolerance where live migration is actually triggered by health monitoring.",
  note      = "Acceptance rate 23.6\% (29/123). Most cited paper with 178 citations"
}
% ARES 2007: overview of programming models for service-level high availability.
@conference{engelmann07programming,
  author    = "Christian Engelmann and Stephen L. Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He",
  title     = "On Programming Models for Service-Level High Availability",
  booktitle = "Proceedings of the \href{http://www.ares-conference.eu/ares2007}{$2^{nd}$ International Conference on Availability, Reliability and Security (ARES) 2007}",
  pages     = "999--1006",
  month     = apr # "~10-13, ",
  year      = "2007",
  address   = "Vienna, Austria",
  publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}",
  isbn      = "0-7695-2775-2",
  doi       = "http://doi.ieeecomputersociety.org/10.1109/ARES.2007.109",
  url       = "http://www.christian-engelmann.info/publications/engelmann07programming.pdf",
  url2      = "http://www.christian-engelmann.info/publications/engelmann07programming.ppt.pdf",
  abstract  = "This paper provides an overview of existing programming models for service-level high availability and investigates their differences, similarities, advantages, and disadvantages. Its goal is to help to improve reuse of code and to allow adaptation to quality of service requirements by using a uniform programming model description. It further aims at encouraging a discussion about these programming models and their provided quality of service, such as availability, performance, serviceability, usability, and applicability. Within this context, the presented research focuses on providing high availability for services running on head and service nodes of high-performance computing systems.",
  note      = "Acceptance rate 28.3\% (60/212)"
}
@conference{wang07job,
  author        = {Chao Wang and Frank Mueller and Christian Engelmann and Stephen L. Scott},
  title         = {A Job Pause Service under {LAM/MPI+BLCR} for Transparent Fault Tolerance},
  booktitle     = {Proceedings of the \href{http://www.ipdps.org/ipdps2007} {$21^{st}$ IEEE International Parallel and Distributed Processing Symposium (IPDPS) 2007}},
  pages         = {1--10},
  month         = mar # "~26-30, ",
  year          = {2007},
  address       = {Long Beach, CA, USA},
  publisher     = {\href{http://www.acm.org}{ACM Press, New York, NY, USA}},
  isbn          = {978-1-59593-768-1},
  doi           = {http://doi.ieeecomputersociety.org/10.1109/IPDPS.2007.370307},
  url           = {http://www.christian-engelmann.info/publications/wang07job.pdf},
  url2          = {http://www.christian-engelmann.info/publications/wang07job.ppt.pdf},
  abstract      = {Checkpoint/restart (C/R) has become a requirement for long-running jobs in large-scale clusters due to a mean-time-to-failure (MTTF) in the order of hours. After a failure, C/R mechanisms generally require a complete restart of an MPI job from the last checkpoint. A complete restart, however, is unnecessary since all but one node are typically still alive. Furthermore, a restart may result in lengthy job requeuing even though the original job had not exceeded its time quantum. In this paper, we overcome these shortcomings. Instead of job restart, we have developed a transparent mechanism for job pause within LAM/MPI+BLCR. This mechanism allows live nodes to remain active and roll back to the last checkpoint while failed nodes are dynamically replaced by spares before resuming from the last checkpoint. Our methodology includes LAM/MPI enhancements in support of scalable group communication with fluctuating number of nodes, reuse of network connections, transparent coordinated checkpoint scheduling and a BLCR enhancement for job pause. Experiments in a cluster with the NAS Parallel Benchmark suite show that our overhead for job pause is comparable to that of a complete job restart.
A minimal overhead of 5.6\% is only incurred in case migration takes place while the regular checkpoint overhead remains unchanged. Yet, our approach alleviates the need to reboot the LAM run-time environment, which accounts for considerable overhead resulting in net savings of our scheme in the experiments. Our solution further provides full transparency and automation with the additional benefit of reusing existing resources. Executing continues after failures within the scheduled job, \textit{i.e.}, the application staging overhead is not incurred again in contrast to a restart. Our scheme offers additional potential for savings through incremental checkpointing and proactive diskless live migration, which we are currently working on.},
  note          = {Acceptance rate 26\% (109/419)},
  internal-note = {NOTE(review): publisher and isbn name ACM Press, but the doi is an IEEE Computer Society (10.1109) identifier for an IEEE symposium -- presumably copied from another entry; verify publisher and ISBN against the proceedings.}
}
@conference{uhlemann06joshua,
  author    = {Kai Uhlemann and Christian Engelmann and Stephen L. Scott},
  title     = {{JOSHUA}: {Symmetric} Active/Active Replication for Highly Available {HPC} Job and Resource Management},
  booktitle = {Proceedings of the \href{http://cluster2006.org}{$8^{th}$ IEEE International Conference on Cluster Computing (Cluster) 2006}},
  pages     = {1--10},
  month     = sep # "~25-28, ",
  year      = {2006},
  address   = {Barcelona, Spain},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {1-4244-0328-6},
  issn      = {1552-5244},
  doi       = {http://doi.ieeecomputersociety.org/10.1109/CLUSTR.2006.311855},
  url       = {http://www.christian-engelmann.info/publications/uhlemann06joshua.pdf},
  url2      = {http://www.christian-engelmann.info/publications/uhlemann06joshua.ppt.pdf},
  abstract  = {Most of today's HPC systems employ a single head node for control, which represents a single point of failure as it interrupts an entire HPC system upon failure. Furthermore, it is also a single point of control as it disables an entire HPC system until repair. One of the most important HPC system service running on the head node is the job and resource management. If it goes down, all currently running jobs loose the service they report back to. They have to be restarted once the head node is up and running again. With this paper, we present a generic approach for providing symmetric active/active replication for highly available HPC job and resource management. The JOSHUA solution provides a virtually synchronous environment for continuous availability without any interruption of service and without any loss of state. Replication is performed externally via the PBS service interface without the need to modify any service code.
Test results as well as availability analysis of our proof-of-concept prototype implementation show that continuous availability can be provided by JOSHUA with an acceptable performance trade-off.},
  note      = {Acceptance rate 33.1\% (42/127)}
}
@conference{baumann06parallel,
  author    = {Ronald Baumann and Christian Engelmann and George A. (Al) Geist},
  title     = {A Parallel Plug-in Programming Paradigm},
  booktitle = {Lecture Notes in Computer Science: Proceedings of the \href{http://hpcc06.lrr.in.tum.de}{$7^{th}$ International Conference on High Performance Computing and Communications (HPCC) 2006}},
  volume    = {4208},
  pages     = {823--832},
  month     = sep # "~13-15, ",
  year      = {2006},
  address   = {Munich, Germany},
  publisher = {\href{http://www.springer.com}{Springer Verlag, Berlin, Germany}},
  isbn      = {978-3-540-39368-9},
  issn      = {0302-9743},
  doi       = {http://dx.doi.org/10.1007/11847366_85},
  url       = {http://www.christian-engelmann.info/publications/baumann06parallel.pdf},
  url2      = {http://www.christian-engelmann.info/publications/baumann06parallel.ppt.pdf},
  abstract  = {Software component architectures allow assembly of applications from individual software modules based on clearly defined programming interfaces, thus improving the reuse of existing solutions and simplifying application development. Furthermore, the plug-in programming paradigm additionally enables runtime reconfigurability, making it possible to adapt to changing application needs, such as different application phases, and system properties, like resource availability, by loading/unloading appropriate software modules. Similar to parallel programs, parallel plug-ins are an abstraction for a set of cooperating individual plug-ins within a parallel application utilizing a software component architecture. Parallel programming paradigms apply to parallel plug-ins in the same way they apply to parallel programs. The research presented in this paper targets the clear definition of parallel plug-ins and the development of a parallel plug-in programming paradigm.}
}
@conference{varma06scalable,
  author    = {Jyothish Varma and Chao Wang and Frank Mueller and Christian Engelmann and Stephen L. Scott},
  title     = {Scalable, Fault-Tolerant Membership for {MPI} Tasks on {HPC} Systems},
  booktitle = {Proceedings of the \href{http://www.ics-conference.org/2006} {$20^{th}$ ACM International Conference on Supercomputing (ICS) 2006}},
  pages     = {219--228},
  month     = jun # "~28-30, ",
  year      = {2006},
  address   = {Cairns, Australia},
  publisher = {\href{http://www.acm.org}{ACM Press, New York, NY, USA}},
  doi       = {http://doi.acm.org/10.1145/1183401.1183433},
  isbn      = {1-59593-282-8},
  url       = {http://www.christian-engelmann.info/publications/varma06scalable.pdf},
  url2      = {http://www.christian-engelmann.info/publications/varma06scalable.ppt.pdf},
  abstract  = {Reliability is increasingly becoming a challenge for high-performance computing (HPC) systems with thousands of nodes, such as IBM's Blue Gene/L. A shorter mean-time-to-failure can be addressed by adding fault tolerance to reconfigure working nodes to ensure that communication and computation can progress. However, existing approaches fall short in providing scalability and small reconfiguration overhead within the fault-tolerant layer. This paper contributes a scalable approach to reconfigure the communication infrastructure after node failures. We propose a decentralized (peer-to-peer) protocol that maintains a consistent view of active nodes in the presence of faults. Our protocol shows response times in the order of hundreds of microseconds and single-digit milliseconds for reconfiguration using MPI over Blue Gene/L and TCP over Gigabit, respectively. The protocol can be adapted to match the network topology to further increase performance. We also verify experimental results against a performance model, which demonstrates the scalability of the approach.
Hence, the membership service is suitable for deployment in the communication layer of MPI runtime systems, and we have integrated an early version into LAM/MPI.},
  note      = {Acceptance rate 26.2\% (37/141)}
}
@conference{okunbor06exploring,
  author    = {Daniel I. Okunbor and Christian Engelmann and Stephen L. Scott},
  title     = {Exploring Process Groups for Reliability, Availability and Serviceability of Terascale Computing Systems},
  booktitle = {Proceedings of the \href{http://www.atiner.gr/docs/2006AAAPROGRAM_COMP.htm} {$2^{nd}$ International Conference on Computer Science and Information Systems 2006}},
  month     = jun # "~19-21, ",
  year      = {2006},
  address   = {Athens, Greece},
  url       = {http://www.christian-engelmann.info/publications/okunbor06exploring.pdf},
  abstract  = {This paper presents various aspects of reliability, availability and serviceability (RAS) systems as they relate to group communication service, including reliable and total order multicast/broadcast, virtual synchrony, and failure detection. While the issue of availability, particularly high availability using replication-based architectures has recently received upsurge research interests, much still have to be done in understanding the basic underlying concepts for achieving RAS systems, especially in high-end and high performance computing (HPC) communities. Various attributes of group communication service and the prototype of symmetric active replication following ideas utilized in the Newtop protocol will be discussed. We explore the application of group communication service for RAS HPC, laying the groundwork for its integrated model.}
}
@conference{limaye05jobsite,
  author    = {Kshitij Limaye and Chokchai (Box) Leangsuksun and Zeno Greenwood and Stephen L. Scott and Christian Engelmann and Richard M. Libby and Kasidit Chanchio},
  title     = {Job-Site Level Fault Tolerance for Cluster and {Grid} Environments},
  booktitle = {Proceedings of the \href{http://cluster2005.org}{$7^{th}$ IEEE International Conference on Cluster Computing (Cluster) 2005}},
  pages     = {1--9},
  month     = sep # "~26-30, ",
  year      = {2005},
  address   = {Boston, MA, USA},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {0-7803-9486-0},
  issn      = {1552-5244},
  doi       = {http://doi.ieeecomputersociety.org/10.1109/CLUSTR.2005.347043},
  url       = {http://www.christian-engelmann.info/publications/limaye05job-site.pdf},
  abstract  = {In order to adopt high performance clusters and Grid computing for mission critical applications, fault tolerance is a necessity. Common fault tolerance techniques in distributed systems are normally achieved with checkpoint-recovery and job replication on alternative resources, in cases of a system outage. The first approach depends on the system's MTTR while the latter approach depends on the availability of alternative sites to run replicas. There is a need for complementing these approaches by proactively handling failures at a job-site level, ensuring the system high availability with no loss of user submitted jobs. This paper discusses a novel fault tolerance technique that enables the job-site recovery in Beowulf cluster-based grid environments, whereas existing techniques give up a failed system by seeking alternative resources. Our results suggest sizable aggregate performance improvement during an implementation of our method in Globus-enabled HA-OSCAR.
The technique called Smart Failover provides a transparent and graceful recovery mechanism that saves job states in a local job-manager queue and transfers those states to the backup server periodically, and in critical system events. Thus whenever a failover occurs, the backup server is able to restart the jobs from their last saved state.},
  note      = {Acceptance rate 39.6\% (45/138)}
}
@conference{song05umlbased,
  author    = {Hertong Song and Chokchai (Box) Leangsuksun and Raja Nassar and Yudan Liu and Christian Engelmann and Stephen L. Scott},
  title     = {{UML-based} {Beowulf} Cluster Availability Modeling},
  booktitle = {\href{http://www.world-academy-of-science.org/IMCSE2005/ws/SERP} {International Conference on Software Engineering Research and Practice (SERP) 2005}},
  pages     = {161--167},
  month     = jun # "~27-30, ",
  year      = {2005},
  address   = {Las Vegas, NV, USA},
  publisher = {CSREA Press},
  isbn      = {1-932415-49-1}
}
@conference{engelmann05superscalable,
  author    = {Christian Engelmann and George A. (Al) Geist},
  title     = {Super-Scalable Algorithms for Computing on 100,000 Processors},
  booktitle = {Lecture Notes in Computer Science: Proceedings of the \href{http://www.iccs-meeting.org/iccs2005}{$5^{th}$ International Conference on Computational Science (ICCS) 2005}, Part I},
  volume    = {3514},
  pages     = {313--320},
  month     = may # "~22-25, ",
  year      = {2005},
  address   = {Atlanta, GA, USA},
  publisher = {\href{http://www.springer.com}{Springer Verlag, Berlin, Germany}},
  isbn      = {978-3-540-26032-5},
  issn      = {0302-9743},
  doi       = {http://dx.doi.org/10.1007/11428831_39},
  url       = {http://www.christian-engelmann.info/publications/engelmann05superscalable.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann05superscalable.ppt.pdf},
  abstract  = {In the next five years, the number of processors in high-end systems for scientific computing is expected to rise to tens and even hundreds of thousands. For example, the IBM Blue Gene/L can have up to 128,000 processors and the delivery of the first system is scheduled for 2005. Existing deficiencies in scalability and fault-tolerance of scientific applications need to be addressed soon. If the number of processors grows by a magnitude and efficiency drops by a magnitude, the overall effective computing performance stays the same. Furthermore, the mean time to interrupt of high-end computer systems decreases with scale and complexity. In a 100,000-processor system, failures may occur every couple of minutes and traditional checkpointing may no longer be feasible. With this paper, we summarize our recent research in super-scalable algorithms for computing on 100,000 processors. We introduce the algorithm properties of scale invariance and natural fault tolerance, and discuss how they can be applied to two different classes of algorithms.
We also describe a super-scalable diskless checkpointing algorithm for problems that can't be transformed into a super-scalable variant, or where other solutions are more efficient. Finally, a 100,000-processor simulator is presented as a platform for testing and experimentation.},
  note      = {Acceptance rate 35\%}
}
@conference{engelmann13toward,
  author    = {Christian Engelmann and Thomas Naughton},
  title     = {Toward a Performance/Resilience Tool for Hardware/Software Co-Design of High-Performance Computing Systems},
  booktitle = {Proceedings of the \href{http://icpp2013.ens-lyon.fr}{$42^{nd}$ International Conference on Parallel Processing (ICPP) 2013}: \href{http://www.psti-workshop.org} {$4^{th}$ International Workshop on Parallel Software Tools and Tool Infrastructures (PSTI)}},
  pages     = {},
  month     = oct # "~2, ",
  year      = {2013},
  address   = {Lyon, France},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {},
  issn      = {},
  doi       = {},
  url       = {},
  url2      = {},
  abstract  = {xSim is a simulation-based performance investigation toolkit that permits running high-performance computing (HPC) applications in a controlled environment with millions of concurrent execution threads, while observing application performance in a simulated extreme-scale system for hardware/software co-design. The presented work details newly developed features for xSim that permit the injection of MPI process failures, the propagation/detection/notification of such failures within the simulation, and their handling using application-level checkpoint/restart. These new capabilities enable the observation of application behavior and performance under failure within a simulated future-generation HPC system using the most common fault handling technique.},
  note      = {To appear}
}
@conference{jones11simulation,
  author    = {Ian S. Jones and Christian Engelmann},
  title     = {Simulation of Large-Scale {HPC} Architectures},
  booktitle = {Proceedings of the \href{http://icpp2011.org}{$40^{th}$ International Conference on Parallel Processing (ICPP) 2011}: \href{http://www.psti-workshop.org} {$2^{nd}$ International Workshop on Parallel Software Tools and Tool Infrastructures (PSTI)}},
  pages     = {447--456},
  month     = sep # "~13-19, ",
  year      = {2011},
  address   = {Taipei, Taiwan},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {978-0-7695-4511-0},
  issn      = {1530-2016},
  doi       = {http://dx.doi.org/10.1109/ICPPW.2011.44},
  url       = {http://www.christian-engelmann.info/publications/jones11simulation.pdf},
  url2      = {http://www.christian-engelmann.info/publications/jones11simulation.ppt.pdf},
  abstract  = {The Extreme-scale Simulator (xSim) is a recently developed performance investigation toolkit that permits running high-performance computing (HPC) applications in a controlled environment with millions of concurrent execution threads. It allows observing parallel application performance properties in a simulated extreme-scale HPC system to further assist in HPC hardware and application software co-design on the road toward multi-petascale and exascale computing. This paper presents a newly implemented network model for the xSim performance investigation toolkit that is capable of providing simulation support for a variety of HPC network architectures with the appropriate trade-off between simulation scalability and accuracy. The taken approach focuses on a scalable distributed solution with latency and bandwidth restrictions for the simulated network. Different network architectures, such as star, ring, mesh, torus, twisted torus and tree, as well as hierarchical combinations, such as to simulate network-on-chip and network-on-node, are supported.
Network traffic congestion modeling is omitted to gain simulation scalability by reducing simulation accuracy.}
}
@conference{fiala11tunable,
  author    = {David Fiala and Kurt Ferreira and Frank Mueller and Christian Engelmann},
  title     = {A Tunable, Software-based {DRAM} Error Detection and Correction Library for {HPC}},
  booktitle = {Lecture Notes in Computer Science: Proceedings of the \href{http://europar2011.bordeaux.inria.fr/}{$17^{th}$ European Conference on Parallel and Distributed Computing (Euro-Par) 2011 Workshops, Part II}: \href{http://xcr.cenit.latech.edu/resilience2011}{$4^{th}$ Workshop on Resiliency in High Performance Computing (Resilience) in Clusters, Clouds, and Grids}},
  volume    = {7156},
  pages     = {251--261},
  month     = aug # "~29 - " # sep # "~2, ",
  year      = {2011},
  address   = {Bordeaux, France},
  publisher = {\href{http://www.springer.com}{Springer Verlag, Berlin, Germany}},
  isbn      = {978-3-642-29740-3},
  doi       = {http://dx.doi.org/10.1007/978-3-642-29740-3_29},
  url       = {http://www.christian-engelmann.info/publications/fiala11tunable.pdf},
  url2      = {},
  abstract  = {Proposed exascale systems will present a number of considerable resiliency challenges. In particular, DRAM soft-errors, or bit-flips, are expected to greatly increase due to the increased memory density of these systems. Current hardware-based fault-tolerance methods will be unsuitable for addressing the expected soft error frequency rate. As a result, additional software will be needed to address this challenge. In this paper we introduce LIBSDC, a tunable, transparent silent data corruption detection and correction library for HPC applications. LIBSDC provides comprehensive SDC protection for program memory by implementing on-demand page integrity verification. Experimental benchmarks with Mantevo HPCCG show that once tuned, LIBSDC is able to achieve SDC protection with 50\% overhead of resources, less than the 100\% needed for double modular redundancy.},
  note      = {Acceptance rate 60.0\% (12/20)}
}
@conference{naughton11case,
  author    = {Thomas Naughton and Geoffroy R. Vall{\'e}e and Christian Engelmann and Stephen L. Scott},
  title     = {A Case for Virtual Machine based Fault Injection in a High-Performance Computing Environment},
  booktitle = {Lecture Notes in Computer Science: Proceedings of the \href{http://europar2011.bordeaux.inria.fr/}{$17^{th}$ European Conference on Parallel and Distributed Computing (Euro-Par) 2011}: \href{http://www.csm.ornl.gov/srt/conferences/hpcvirt2011} {$5^{th}$ Workshop on System-level Virtualization for High Performance Computing (HPCVirt)}},
  volume    = {7155},
  pages     = {234--243},
  month     = aug # "~29 - " # sep # "~2, ",
  year      = {2011},
  address   = {Bordeaux, France},
  publisher = {\href{http://www.springer.com}{Springer Verlag, Berlin, Germany}},
  isbn      = {978-3-642-29737-3},
  doi       = {http://dx.doi.org/10.1007/978-3-642-29737-3_27},
  url       = {http://www.christian-engelmann.info/publications/naughton11case.pdf},
  url2      = {http://www.christian-engelmann.info/publications/naughton11case.ppt.pdf},
  abstract  = {Large-scale computing platforms provide tremendous capabilities for scientific discovery. These systems have hundreds of thousands of computing cores, hundreds of terabytes of memory, and enormous high-performance interconnection networks. These systems are facing enormous challenges to achieve performance at such scale. Failures are an Achilles heel of these enormous systems. As applications and system software scale up to multi-petaflop and beyond to exascale platforms, the occurrence of failure will be much more common. This has given rise to a push in fault-tolerance and resilience research for HPC systems. This includes work on log analysis to identify types of failures, enhancements to the Message Passing Interface (MPI) to incorporate fault awareness, and a variety of fault tolerance mechanisms that span redundant computation, algorithm based fault tolerance, and advanced checkpoint/restart techniques.
While there is much work to be done on the FT/Resilience mechanisms for such large-scale systems, there is also a profound gap in the tools for experimentation. This gap is compounded by the fact that HPC environments have stringent performance requirements and are often highly customized. The tool chain for these systems are often tailored for the platform and while the majority of systems on the Top500 Supercomputer list run Linux, these operating environments typically contain many site/machine specific enhancements. Therefore, it is desirable to maintain a consistent execution environment to minimize end-user (scientist) interruption. The work on system-level virtualization for HPC system offers a unique opportunity to maintain a consistent execution environment via a virtual machine (VM). Recent work on virtualization for HPC has shown that low-overhead, high performance systems can be realized [1, 2]. Virtualization also provides a clean abstraction for building experimental tools for investigation into the effects of failures in HPC and the related research on FT/Resilience mechanisms and policies. In this paper we discuss the motivation for tools to perform fault injection in an HPC context, and outline an approach that can leverage virtualization.}
}
@conference{engelmann10facilitating,
  author    = {Christian Engelmann and Frank Lauer},
  title     = {Facilitating Co-Design for Extreme-Scale Systems Through Lightweight Simulation},
  booktitle = {Proceedings of the \href{http://www.cluster2010.org}{$12^{th}$ IEEE International Conference on Cluster Computing (Cluster) 2010}: \href{http://www2.wmin.ac.uk/getovv/aacec10.html} {$1^{st}$ Workshop on Application/Architecture Co-design for Extreme-scale Computing (AACEC)}},
  pages     = {1--8},
  month     = sep # "~20-24, ",
  year      = {2010},
  address   = {Hersonissos, Crete, Greece},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {978-1-4244-8395-2},
  doi       = {http://dx.doi.org/10.1109/CLUSTERWKSP.2010.5613113},
  url       = {http://www.christian-engelmann.info/publications/engelmann10facilitating.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann10facilitating.ppt.pdf},
  abstract  = {This work focuses on tools for investigating algorithm performance at extreme scale with millions of concurrent threads and for evaluating the impact of future architecture choices to facilitate the co-design of high-performance computing (HPC) architectures and applications. The approach focuses on lightweight simulation of extreme-scale HPC systems with the needed amount of accuracy. The prototype presented in this paper is able to provide this capability using a parallel discrete event simulation (PDES), such that a Message Passing Interface (MPI) application can be executed at extreme scale, and its performance properties can be evaluated. The results of an initial prototype are encouraging as a simple hello world MPI program could be scaled up to 1,048,576 virtual MPI processes on a four-node cluster, and the performance properties of two MPI programs could be evaluated at up to 1,024 and 16,384 virtual MPI processes on the same system.}
}
@conference{ostrouchov09nonparametric,
  author    = {George Ostrouchov and Thomas Naughton and Christian Engelmann and Geoffroy R. Vall{\'e}e and Stephen L. Scott},
  title     = {Nonparametric Multivariate Anomaly Analysis in Support of {HPC} Resilience},
  booktitle = {Proceedings of the \href{http://www.oerc.ox.ac.uk/ieee} {$5^{th}$ IEEE International Conference on e-Science (e-Science) 2009}: \href{http://www.oerc.ox.ac.uk/ieee/workshops/workshops/computational-science} {Workshop on Computational Science}},
  pages     = {80--85},
  month     = dec # "~9-11, ",
  year      = {2009},
  address   = {Oxford, UK},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {978-1-4244-5946-9},
  doi       = {http://dx.doi.org/10.1109/ESCIW.2009.5407992},
  url       = {http://www.christian-engelmann.info/publications/ostrouchov09nonparametric.pdf},
  url2      = {http://www.christian-engelmann.info/publications/ostrouchov09nonparametric.ppt.pdf},
  abstract  = {Large-scale computing systems provide great potential for scientific exploration. However, the complexity that accompanies these enormous machines raises challenges for both, users and operators. The effective use of such systems is often hampered by failures encountered when running applications on systems containing tens-of-thousands of nodes and hundreds-of-thousands of compute cores capable of yielding petaflops of performance. In systems of this size failure detection is complicated and root-cause diagnosis difficult. This paper describes our recent work in the identification of anomalies in monitoring data and system logs to provide further insights into machine status, runtime behavior, failure modes and failure root causes. It discusses the details of an initial prototype that gathers the data and uses statistical techniques for analysis.}
}
@conference{naughton09fault,
  author    = {Thomas Naughton and Wesley Bland and Geoffroy R. Vall{\'e}e and Christian Engelmann and Stephen L. Scott},
  title     = {Fault Injection Framework for System Resilience Evaluation -- {Fake} Faults for Finding Future Failures},
  booktitle = {Proceedings of the \href{http://www.lrz-muenchen.de/hpdc2009}{$18^{th}$ International Symposium on High Performance Distributed Computing (HPDC) 2009}: \href{http://xcr.cenit.latech.edu/resilience2009}{$2^{nd}$ Workshop on Resiliency in High Performance Computing (Resilience) 2009}},
  pages     = {23--28},
  month     = jun # "~9, ",
  year      = {2009},
  address   = {Munich, Germany},
  publisher = {\href{http://www.acm.org}{ACM Press, New York, NY, USA}},
  isbn      = {978-1-60558-587-1},
  doi       = {http://doi.acm.org/10.1145/1552526.1552530},
  url       = {http://www.christian-engelmann.info/publications/naughton09fault.pdf},
  url2      = {http://www.christian-engelmann.info/publications/naughton09fault.ppt.pdf},
  abstract  = {As high-performance computing (HPC) systems increase in size and complexity they become more difficult to manage. The enormous component counts associated with these large systems lead to significant challenges in system reliability and availability. This in turn is driving research into the resilience of large scale systems, which seeks to curb the effects of increased failures at large scales by masking the inevitable faults in these systems. The basic premise being that failure must be accepted as a reality of large scale system and coped with accordingly through system resilience. A key component in the development and evaluation of system resilience techniques is having a means to conduct controlled experiments. A common method for performing such experiments is to generate synthetic faults and study the resulting effects. In this paper we discuss the motivation and our initial use of software fault injection to support the evaluation of resilience for HPC systems.
We mention background and related work in the area and discuss the design of a tool to aid in fault injection experiments for both user-space (application-level) and system-level failures.}
}
@conference{tikotekar09performance,
  author    = {Anand Tikotekar and Hong H. Ong and Sadaf Alam and Geoffroy R. Vall{\'e}e and Thomas Naughton and Christian Engelmann and Stephen L. Scott},
  title     = {Performance Comparison of Two Virtual Machine Scenarios Using an {HPC} Application -- {A} Case study Using Molecular Dynamics Simulations},
  booktitle = {Proceedings of the \href{http://www.csm.ornl.gov/srt/hpcvirt09}{$3^{rd}$ Workshop on System-level Virtualization for High Performance Computing (HPCVirt) 2009}, in conjunction with the \href{http://www.eurosys.org/2009}{$4^{th}$ ACM SIGOPS European Conference on Computer Systems (EuroSys) 2009}},
  pages     = {33--40},
  month     = mar # "~30, ",
  year      = {2009},
  address   = {Nuremberg, Germany},
  publisher = {\href{http://www.acm.org}{ACM Press, New York, NY, USA}},
  isbn      = {978-1-60558-465-2},
  doi       = {http://doi.acm.org/10.1145/1519138.1519143},
  url       = {http://www.christian-engelmann.info/publications/tikotekar09performance.pdf},
  url2      = {http://www.christian-engelmann.info/publications/tikotekar09performance.ppt.pdf},
  abstract  = {Obtaining high flexibility to performance-loss ratio is a key challenge of today's HPC virtual environment landscape. And while extensive research has been targeted at extracting more performance from virtual machines, the idea that whether novel virtual machine usage scenarios could lead to high flexibility Vs performance trade-off has received less attention. We, in this paper, take a step forward by studying and comparing the performance implications of running the Large-scale Atomic/Molecular Massively Parallel Simulator (LAMMPS) application on two virtual machine configurations. First configuration consists of two virtual machines per node with 1 application process per virtual machine. The second configuration consists of 1 virtual machine per node with 2 processes per virtual machine. Xen has been used as an hypervisor and standard Linux as a guest virtual machine.
Our results show that the difference in overall performance impact on LAMMPS between the two virtual machine configurations described above is around 3\%. We also study the difference in performance impact in terms of each configuration's individual metrics such as CPU, I/O, Memory, and interrupt/context switches.}
}
@conference{vallee08virtual,
  author    = {Geoffroy R. Vall{\'e}e and Thomas Naughton and Hong H. Ong and Anand Tikotekar and Christian Engelmann and Wesley Bland and Ferrol Aderholt and Stephen L. Scott},
  title     = {Virtual System Environments},
  booktitle = {Communications in Computer and Information Science: Proceedings of the \href{http://www.dmtf.org/svm08}{$2^{nd}$ DMTF Academic Alliance Workshop on Systems and Virtualization Management: Standards and New Technologies (SVM) 2008}},
  volume    = {18},
  pages     = {72--83},
  month     = oct # "~21-22, ",
  year      = {2008},
  address   = {Munich, Germany},
  publisher = {\href{http://www.springer.com}{Springer Verlag, Berlin, Germany}},
  isbn      = {978-3-540-88707-2},
  issn      = {1865-0929},
  doi       = {http://dx.doi.org/10.1007/978-3-540-88708-9_7},
  url       = {http://www.christian-engelmann.info/publications/vallee08virtual.pdf},
  url2      = {},
  abstract  = {Distributed and parallel systems are typically managed with static settings: the operating system (OS) and the runtime environment (RTE) are specified at a given time and cannot be changed to fit an application's needs. This means that every time application developers want to use their application on a new execution platform, the application has to be ported to this new environment, which may be expensive in terms of application modifications and developer time. However, the science resides in the applications and not in the OS or the RTE. Therefore, it should be beneficial to adapt the OS and the RTE to the application instead of adapting the applications to the OS and the RTE. This document presents the concept of Virtual System Environments (VSE), which enables application developers to specify and create a virtual environment that properly fits their application's needs. 
For that four challenges have to be addressed: (i) definition of the VSE itself by the application developers, (ii) deployment of the VSE, (iii) system administration for the platform, and (iv) protection of the platform from the running VSE. We therefore present an integrated tool for the definition and deployment of VSEs on top of traditional and virtual (i.e., using system-level virtualization) execution platforms. This tool provides the capability to choose the degree of delegation for system administration tasks and the degree of protection from the application (e.g., using virtual machines). To summarize, the VSE concept enables the customization of the OS/RTE used for the execution of application by users without compromising local system administration rules and execution platform protection constraints.}
}
@conference{tikotekar08analysis,
  author    = {Anand Tikotekar and Geoffroy R. Vall{\'e}e and Thomas Naughton and Hong H. Ong and Christian Engelmann and Stephen L. Scott},
  title     = {An Analysis of {HPC} Benchmark Applications in Virtual Machine Environments},
  booktitle = {Lecture Notes in Computer Science: Proceedings of the \href{http://europar2008.caos.uab.es}{$14^{th}$ European Conference on Parallel and Distributed Computing (Euro-Par) 2008}: \href{http://scilytics.com/vhpc}{$3^{rd}$ Workshop on Virtualization in High-Performance Cluster and Grid Computing (VHPC) 2008}},
  volume    = {5415},
  pages     = {63--71},
  month     = aug # "~26-29, ",
  year      = {2008},
  address   = {Las Palmas de Gran Canaria, Spain},
  publisher = {\href{http://www.springer.com}{Springer Verlag, Berlin, Germany}},
  isbn      = {978-3-642-00954-9},
  doi       = {http://dx.doi.org/10.1007/978-3-642-00955-6},
  url       = {http://www.christian-engelmann.info/publications/tikotekar08analysis.pdf},
  url2      = {http://www.christian-engelmann.info/publications/tikotekar08analysis.ppt.pdf},
  abstract  = {Virtualization technology has been gaining acceptance in the scientific community due to its overall flexibility in running HPC applications. It has been reported that a specific class of applications is better suited to a particular type of virtualization scheme or implementation. For example, Xen has been shown to perform with little overhead for compute-bound applications. Such a study, although useful, does not allow us to generalize conclusions beyond the performance analysis of that application which is explicitly executed. An explanation of why the generalization described above is difficult, may be due to the versatility in applications, which leads to different overheads in virtual environments. For example, two similar applications may spend disproportionate amount of time in their respective library code when run in virtual environments. 
In this paper, we aim to study such potential causes by investigating the behavior and identifying patterns of various overheads for HPC benchmark applications. Based on the investigation of the overhead profiles for different benchmarks, we aim to address questions such as: Are the overhead profiles for a particular type of benchmarks (such as compute-bound) similar or are there grounds to conclude otherwise?}
}
@conference{engelmann08symmetric2,
  author    = {Christian Engelmann and Stephen L. Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He},
  title     = {Symmetric Active/Active High Availability for High-Performance Computing System Services: Accomplishments and Limitations},
  booktitle = {Proceedings of the \href{http://www.ens-lyon.fr/LIP/RESO/ccgrid2008}{$8^{th}$ IEEE International Symposium on Cluster Computing and the Grid (CCGrid) 2008}: \href{http://xcr.cenit.latech.edu/resilience2008}{Workshop on Resiliency in High Performance Computing (Resilience) 2008}},
  pages     = {813--818},
  month     = may # "~19-22, ",
  year      = {2008},
  address   = {Lyon, France},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {978-0-7695-3156-4},
  doi       = {http://doi.ieeecomputersociety.org/10.1109/CCGRID.2008.78},
  url       = {http://www.christian-engelmann.info/publications/engelmann08symmetric2.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann08symmetric2.ppt.pdf},
  abstract  = {This paper summarizes our efforts over the last 3-4 years in providing symmetric active/active high availability for high-performance computing (HPC) system services. This work paves the way for high-level reliability, availability and serviceability in extreme-scale HPC systems by focusing on the most critical components, head and service nodes, and by reinforcing them with appropriate high availability solutions. This paper presents our accomplishments in the form of concepts and respective prototypes, discusses existing limitations, outlines possible future work, and describes the relevance of this research to other, planned efforts.}
}
@conference{chen08online,
  author    = {Xin Chen and Benjamin Eckart and Xubin (Ben) He and Christian Engelmann and Stephen L. Scott},
  title     = {An Online Controller Towards Self-Adaptive File System Availability and Performance},
  booktitle = {Proceedings of the \href{http://xcr.cenit.latech.edu/hapcw2008}{$5^{th}$ High Availability and Performance Workshop (HAPCW) 2008}, in conjunction with the \href{http://www.hpcsw.org}{$1^{st}$ High-Performance Computer Science Week (HPCSW) 2008}},
  month     = apr # "~3-4, ",
  year      = {2008},
  address   = {Denver, CO, USA},
  url       = {http://www.christian-engelmann.info/publications/chen08online.pdf},
  url2      = {http://www.christian-engelmann.info/publications/chen08online.ppt.pdf},
  abstract  = {At the present time, it can be a significant challenge to build a large-scale distributed file system that simultaneously maintains both high availability and high performance. Although many fault tolerance technologies have been proposed and used in both commercial and academic distributed file systems to achieve high availability, most of them typically sacrifice performance for higher system availability. Additionally, recent studies show that system availability and performance are related to the system workload. In this paper, we analyze the correlations among availability, performance, and workloads based on a replication strategy, and we discuss the trade off between availability and performance with different workloads. Our analysis leads to the design of an online controller that can dynamically achieve optimal performance and availability by tuning the system replication policy.}
}
@conference{tikotekar08effects,
  author    = {Anand Tikotekar and Geoffroy R. Vall{\'e}e and Thomas Naughton and Hong H. Ong and Christian Engelmann and Stephen L. Scott and Anthony M. Filippi},
  title     = {Effects of Virtualization on a Scientific Application -- {Running} a Hyperspectral Radiative Transfer Code on Virtual Machines},
  booktitle = {Proceedings of the \href{http://www.csm.ornl.gov/srt/hpcvirt08}{$2^{nd}$ Workshop on System-level Virtualization for High Performance Computing (HPCVirt) 2008}, in conjunction with the \href{http://www.eurosys.org/2008}{$3^{rd}$ ACM SIGOPS European Conference on Computer Systems (EuroSys) 2008}},
  pages     = {16--23},
  month     = mar # "~31, ",
  year      = {2008},
  address   = {Glasgow, UK},
  publisher = {\href{http://www.acm.org}{ACM Press, New York, NY, USA}},
  isbn      = {978-1-60558-120-0},
  doi       = {http://doi.acm.org/10.1145/1435452.1435455},
  url       = {http://www.christian-engelmann.info/publications/tikotekar08effects.pdf},
  url2      = {http://www.christian-engelmann.info/publications/tikotekar08effects.ppt.pdf},
  abstract  = {The topic of system-level virtualization has recently begun to receive interest for high performance computing (HPC). This is in part due to the isolation and encapsulation offered by the virtual machine. These traits enable applications to customize their environments and maintain consistent software configurations in their virtual domains. Additionally, there are mechanisms that can be used for fault tolerance like live virtual machine migration. Given these attractive benefits to virtualization, a fundamental question arises, how does this effect my scientific application? We use this as the premise for our paper and observe a real-world scientific code running on a Xen virtual machine. We studied the effects of running a radiative transfer simulation, Hydrolight, on a virtual machine. We discuss our methodology and report observations regarding the usage of virtualization with this application.}
}
@conference{engelmann07middleware,
  author    = {Christian Engelmann and Hong H. Ong and Stephen L. Scott},
  title     = {Middleware in Modern High Performance Computing System Architectures},
  booktitle = {Lecture Notes in Computer Science: Proceedings of the \href{http://www.iccs-meeting.org/iccs2007}{$7^{th}$ International Conference on Computational Science (ICCS) 2007}, Part II: \href{http://www.gup.uni-linz.ac.at/cce2007} {$4^{th}$ Special Session on Collaborative and Cooperative Environments (CCE) 2007}},
  volume    = {4488},
  pages     = {784--791},
  month     = may # "~27-30, ",
  year      = {2007},
  address   = {Beijing, China},
  publisher = {\href{http://www.springer.com}{Springer Verlag, Berlin, Germany}},
  isbn      = {3-540-72585-5},
  issn      = {0302-9743},
  doi       = {http://dx.doi.org/10.1007/978-3-540-72586-2_111},
  url       = {http://www.christian-engelmann.info/publications/engelmann07middleware.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann07middleware.ppt.pdf},
  abstract  = {A recent trend in modern high performance computing (HPC) system architectures employs lean compute nodes running a lightweight operating system (OS). Certain parts of the OS as well as other system software services are moved to service nodes in order to increase performance and scalability. This paper examines the impact of this HPC system architecture trend on HPC middleware software solutions, which traditionally equip HPC systems with advanced features, such as parallel and distributed programming models, appropriate system resource management mechanisms, remote application steering and user interaction techniques. Since the approach of keeping the compute node software stack small and simple is orthogonal to the middleware concept of adding missing OS features between OS and application, the role and architecture of middleware in modern HPC systems needs to be revisited. 
The result is a paradigm shift in HPC middleware design, where single middleware services are moved to service nodes, while runtime environments (RTEs) continue to reside on compute nodes.}
}
@conference{engelmann07transparent,
  author    = {Christian Engelmann and Stephen L. Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He},
  title     = {Transparent Symmetric Active/Active Replication for Service-Level High Availability},
  booktitle = {Proceedings of the \href{http://ccgrid07.lncc.br}{$7^{th}$ IEEE International Symposium on Cluster Computing and the Grid (CCGrid) 2007}: \href{http://www.lri.fr/~fedak/gp2pc-07} {$7^{th}$ International Workshop on Global and Peer-to-Peer Computing (GP2PC) 2007}},
  pages     = {755--760},
  month     = may # "~14-17, ",
  year      = {2007},
  address   = {Rio de Janeiro, Brazil},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {0-7695-2833-3},
  doi       = {http://doi.ieeecomputersociety.org/10.1109/CCGRID.2007.116},
  url       = {http://www.christian-engelmann.info/publications/engelmann07transparent.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann07transparent.ppt.pdf},
  abstract  = {As service-oriented architectures become more important in parallel and distributed computing systems, individual service instance reliability as well as appropriate service redundancy becomes an essential necessity in order to increase overall system availability. This paper focuses on providing redundancy strategies using service-level replication techniques. Based on previous research using symmetric active/active replication, this paper proposes a transparent symmetric active/active replication approach that allows for more reuse of code between individual service-level replication implementations by using a virtual communication layer. Service- and client-side interceptors are utilized in order to provide total transparency. Clients and servers are unaware of the replication infrastructure as it provides all necessary mechanisms internally.}
}
@conference{engelmann07configurable,
  author    = {Christian Engelmann and Stephen L. Scott and Hong H. Ong and Geoffroy R. Vall{\'e}e and Thomas Naughton},
  title     = {Configurable Virtualized System Environments for High Performance Computing},
  booktitle = {Proceedings of the \href{http://www.csm.ornl.gov/srt/hpcvirt07}{$1^{st}$ Workshop on System-level Virtualization for High Performance Computing (HPCVirt) 2007}, in conjunction with the \href{http://www.eurosys.org/2007}{$2^{nd}$ ACM SIGOPS European Conference on Computer Systems (EuroSys) 2007}},
  month     = mar # "~20, ",
  year      = {2007},
  address   = {Lisbon, Portugal},
  url       = {http://www.christian-engelmann.info/publications/engelmann07configurable.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann07configurable.ppt.pdf},
  abstract  = {Existing challenges for current terascale high performance computing (HPC) systems are increasingly hampering the development and deployment efforts of system software and scientific applications for next-generation petascale systems. The expected rapid system upgrade interval toward petascale scientific computing demands an incremental strategy for the development and deployment of legacy and new large-scale scientific applications that avoids excessive porting. Furthermore, system software developers as well as scientific application developers require access to large-scale testbed environments in order to test individual solutions at scale. This paper proposes to address these issues at the system software level through the development of a virtualized system environment (VSE) for scientific computing. The proposed VSE approach enables plug-and-play supercomputing through desktop-to-cluster-to-petaflop computer system-level virtualization based on recent advances in hypervisor virtualization technologies. 
This paper describes the VSE system architecture in detail, discusses needed tools for VSE system management and configuration, and presents respective VSE use case scenarios.}
}
@conference{engelmann06towards,
  author    = {Christian Engelmann and Stephen L. Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He},
  title     = {Towards High Availability for High-Performance Computing System Services: {Accomplishments} and Limitations},
  booktitle = {Proceedings of the \href{http://xcr.cenit.latech.edu/hapcw2006}{$4^{th}$ High Availability and Performance Workshop (HAPCW) 2006}, in conjunction with the \href{http://lacsi.krellinst.org} {$7^{th}$ Los Alamos Computer Science Institute (LACSI) Symposium 2006}},
  month     = oct # "~17, ",
  year      = {2006},
  address   = {Santa Fe, NM, USA},
  url       = {http://www.christian-engelmann.info/publications/engelmann06towards.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann06towards.ppt.pdf},
  abstract  = {During the last several years, our teams at Oak Ridge National Laboratory, Louisiana Tech University, and Tennessee Technological University focused on efficient redundancy strategies for head and service nodes of high-performance computing (HPC) systems in order to pave the way for high availability (HA) in HPC. These nodes typically run critical HPC system services, like job and resource management, and represent single points of failure and control for an entire HPC system. The overarching goal of our research is to provide high-level reliability, availability, and serviceability (RAS) for HPC systems by combining HA and HPC technology. This paper summarizes our accomplishments, such as developed concepts and implemented proof-of-concept prototypes, and describes existing limitations, such as performance issues, which need to be dealt with for production-type deployment.}
}
@conference{ou06achieving,
  author    = {Li Ou and Xin Chen and Xubin (Ben) He and Christian Engelmann and Stephen L. Scott},
  title     = {Achieving Computational {I/O} Efficiency in a High Performance Cluster Using Multicore Processors},
  booktitle = {Proceedings of the \href{http://xcr.cenit.latech.edu/hapcw2006}{$4^{th}$ High Availability and Performance Workshop (HAPCW) 2006}, in conjunction with the \href{http://lacsi.krellinst.org} {$7^{th}$ Los Alamos Computer Science Institute (LACSI) Symposium 2006}},
  month     = oct # "~17, ",
  year      = {2006},
  address   = {Santa Fe, NM, USA},
  url       = {http://www.christian-engelmann.info/publications/ou06achieving.pdf},
  url2      = {http://www.christian-engelmann.info/publications/ou06achieving.ppt.pdf},
  abstract  = {Cluster computing has become one of the most popular platforms for high-performance computing today. The recent popularity of multicore processors provides a flexible way to increase the computational capability of clusters. Although the system performance may improve with multicore processors in a cluster, I/O requests initiated by multiple cores may saturate the I/O bus, and furthermore increase the latency by issuing multiple non-contiguous disk accesses. In this paper, we propose an asymmetric collective I/O for multicore processors to improve multiple non-contiguous accesses. In our configuration, one core in each multicore processor is designated as the coordinator, and others serve as computing cores. The coordinator is responsible for aggregating I/O operations from computing cores and submitting a contiguous request. The coordinator allocates contiguous memory buffers on behalf of other cores to avoid redundant data copies.}
}
@conference{engelmann06rmix,
  author    = {Christian Engelmann and George A. (Al) Geist},
  title     = {{RMIX}: {A} Dynamic, Heterogeneous, Reconfigurable Communication Framework},
  booktitle = {Lecture Notes in Computer Science: Proceedings of the \href{http://www.iccs-meeting.org/iccs2006}{$6^{th}$ International Conference on Computational Science (ICCS) 2006}, Part II: \href{http://www.gup.uni-linz.ac.at/cce2006} {$3^{rd}$ Special Session on Collaborative and Cooperative Environments (CCE) 2006}},
  volume    = {3992},
  pages     = {573--580},
  month     = may # "~28-31, ",
  year      = {2006},
  address   = {Reading, UK},
  publisher = {\href{http://www.springer.com}{Springer Verlag, Berlin, Germany}},
  isbn      = {3-540-34381-4},
  issn      = {0302-9743},
  doi       = {http://dx.doi.org/10.1007/11758525_77},
  url       = {http://www.christian-engelmann.info/publications/engelmann06rmix.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann06rmix.ppt.pdf},
  abstract  = {RMIX is a dynamic, heterogeneous, reconfigurable communication framework that allows software components to communicate using various RMI/RPC protocols, such as ONC RPC, Java RMI and SOAP, by facilitating dynamically loadable provider plug-ins to supply different protocol stacks. With this paper, we present a native (C-based), flexible, adaptable, multi-protocol RMI/RPC communication framework that complements the Java-based RMIX variant previously developed by our partner team at Emory University. Our approach offers the same multi-protocol RMI/RPC services and advanced invocation semantics via a C-based interface that does not require an object-oriented programming language. This paper provides a detailed description of our RMIX framework architecture and some of its features. It describes the general use case of the RMIX framework and its integration into the Harness metacomputing environment in the form of a plug-in.}
}
@conference{engelmann06active,
  author    = {Christian Engelmann and Stephen L. Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He},
  title     = {Active/Active Replication for Highly Available {HPC} System Services},
  booktitle = {Proceedings of the \href{http://www.ares-conference.eu/ares2006}{$1^{st}$ International Conference on Availability, Reliability and Security (ARES) 2006}: $1^{st}$ International Workshop on Frontiers in Availability, Reliability and Security (FARES) 2006},
  pages     = {639--645},
  month     = apr # "~20-22, ",
  year      = {2006},
  address   = {Vienna, Austria},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {0-7695-2567-9},
  doi       = {http://doi.ieeecomputersociety.org/10.1109/ARES.2006.23},
  url       = {http://www.christian-engelmann.info/publications/engelmann06active.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann06active.ppt.pdf},
  abstract  = {Today's high performance computing systems have several reliability deficiencies resulting in availability and serviceability issues. Head and service nodes represent a single point of failure and control for an entire system as they render it inaccessible and unmanageable in case of a failure until repair, causing a significant downtime. This paper introduces two distinct replication methods (internal and external) for providing symmetric active/active high availability for multiple head and service nodes running in virtual synchrony. It presents a comparison of both methods in terms of expected correctness, ease-of-use and performance based on early results from ongoing work in providing symmetric active/active high availability for two HPC system services (TORQUE and PVFS metadata server). It continues with a short description of a distributed mutual exclusion algorithm and a brief statement regarding the handling of Byzantine failures. This paper concludes with an overview of past and ongoing work, and a short summary of the presented research.}
}
@conference{engelmann05concepts,
  author    = {Christian Engelmann and Stephen L. Scott},
  title     = {Concepts for High Availability in Scientific High-End Computing},
  booktitle = {Proceedings of the \href{http://xcr.cenit.latech.edu/hapcw2005}{$3^{rd}$ High Availability and Performance Workshop (HAPCW) 2005}, in conjunction with the \href{http://lacsi.rice.edu/symposium/agenda_2005}{$6^{th}$ Los Alamos Computer Science Institute (LACSI) Symposium 2005}},
  month     = oct # "~11, ",
  year      = {2005},
  address   = {Santa Fe, NM, USA},
  url       = {http://www.christian-engelmann.info/publications/engelmann05concepts.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann05concepts.ppt.pdf},
  abstract  = {Scientific high-end computing (HEC) has become an important tool for scientists world-wide to understand problems, such as in nuclear fusion, human genomics and nanotechnology. Every year, new HEC systems emerge on the market with better performance and higher scale. With only very few exceptions, the overall availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically due to the recent trend towards capability computing. In this paper, we analyze the existing deficiencies of current HEC systems and present several high availability concepts to counter the experienced loss of availability and to alleviate the expected impact on next-generation systems. We explain the application of these concepts to current and future HEC systems and list past and ongoing related research. This paper closes with a short summary of the presented work and a brief discussion of future efforts.}
}
@conference{engelmann05high,
  author    = {Christian Engelmann and Stephen L. Scott},
  title     = {High Availability for Ultra-Scale High-End Scientific Computing},
  booktitle = {Proceedings of the \href{http://coset.irisa.fr}{$2^{nd}$ International Workshop on Operating Systems, Programming Environments and Management Tools for High-Performance Computing on Clusters (COSET-2) 2005}, in conjunction with the \href{http://ics05.csail.mit.edu}{$19^{th}$ ACM International Conference on Supercomputing (ICS) 2005}},
  month     = jun # "~19, ",
  year      = {2005},
  address   = {Cambridge, MA, USA},
  url       = {http://www.christian-engelmann.info/publications/engelmann05high.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann05high.ppt.pdf},
  abstract  = {Ultra-scale architectures for scientific high-end computing with tens to hundreds of thousands of processors, such as the IBM Blue Gene/L and the Cray X1, suffer from availability deficiencies, which impact the efficiency of running computational jobs by forcing frequent checkpointing of applications. Most systems are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services, such as the job scheduler or MPI, or even of the entire machine. In this paper, we present a flexible, pluggable and component-based high availability framework that expands today's effort in high availability computing of keeping a single server alive to include all machines cooperating in a high-end scientific computing environment, while allowing adaptation to system properties and application needs.}
}
@conference{leangsuksun05asymmetric,
  author    = {Chokchai (Box) Leangsuksun and Venkata K. Munganuru and Tong Liu and Stephen L. Scott and Christian Engelmann},
  title     = {Asymmetric Active-Active High Availability for High-end Computing},
  booktitle = {Proceedings of the \href{http://coset.irisa.fr}{$2^{nd}$ International Workshop on Operating Systems, Programming Environments and Management Tools for High-Performance Computing on Clusters (COSET-2) 2005}, in conjunction with the \href{http://ics05.csail.mit.edu}{$19^{th}$ ACM International Conference on Supercomputing (ICS) 2005}},
  month     = jun # "~19, ",
  year      = {2005},
  address   = {Cambridge, MA, USA},
  url       = {http://www.christian-engelmann.info/publications/leangsuksun05asymmetric.pdf},
  url2      = {http://www.christian-engelmann.info/publications/leangsuksun05asymmetric.ppt.pdf},
  abstract  = {Linux clusters have become very popular for scientific computing at research institutions world-wide, because they can be easily deployed at a fairly low cost. However, the most pressing issues of today's cluster solutions are availability and serviceability. The conventional Beowulf cluster architecture has a single head node connected to a group of compute nodes. This head node is a typical single point of failure and control, which severely limits availability and serviceability by effectively cutting off healthy compute nodes from the outside world upon overload or failure. In this paper, we describe a paradigm that addresses this issue using asymmetric active-active high availability. Our framework comprises of n + 1 head nodes, where n head nodes are active in the sense that they provide services to simultaneously incoming user requests. One standby server monitors all active servers and performs a fail-over in case of a detected outage. We present a prototype implementation based on a 2 + 1 solution and discuss initial results.}
}
@conference{engelmann05lightweight,
  author    = {Christian Engelmann and George A. (Al) Geist},
  title     = {A Lightweight Kernel for the Harness Metacomputing Framework},
  booktitle = {Proceedings of the \href{http://www.ipdps.org/ipdps2005}{$19^{th}$ IEEE International Parallel and Distributed Processing Symposium (IPDPS) 2005}: \href{http://www.cs.umass.edu/~rsnbrg/hcw2005} {$14^{th}$ Heterogeneous Computing Workshop (HCW) 2005}},
  month     = apr # "~4, ",
  year      = {2005},
  address   = {Denver, CO, USA},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {0-7695-2312-9},
  issn      = {1530-2075},
  doi       = {http://doi.ieeecomputersociety.org/10.1109/IPDPS.2005.34},
  url       = {http://www.christian-engelmann.info/publications/engelmann05lightweight.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann05lightweight.ppt.pdf},
  abstract  = {Harness is a pluggable heterogeneous Distributed Virtual Machine (DVM) environment for parallel and distributed scientific computing. This paper describes recent improvements in the Harness kernel design. By using a lightweight approach and moving previously integrated system services into software modules, the software becomes more versatile and adaptable. This paper outlines these changes and explains the major Harness kernel components in more detail. A short overview is given of ongoing efforts in integrating RMIX, a dynamic heterogeneous reconfigurable communication framework, into the Harness environment as a new plug-in software module. We describe the overall impact of these changes and how they relate to other ongoing work.}
}
@conference{engelmann04high,
  author    = {Christian Engelmann and Stephen L. Scott and George A. (Al) Geist},
  title     = {High Availability through Distributed Control},
  booktitle = {Proceedings of the \href{http://xcr.cenit.latech.edu/hapcw2004}{$2^{nd}$ High Availability and Performance Workshop (HAPCW) 2004}, in conjunction with the \href{http://lacsi.rice.edu/symposium/agenda_2004}{$5^{th}$ Los Alamos Computer Science Institute (LACSI) Symposium 2004}},
  month     = oct # "~12, ",
  year      = {2004},
  address   = {Santa Fe, NM, USA},
  url       = {http://www.christian-engelmann.info/publications/engelmann04high.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann04high.ppt.pdf},
  abstract  = {Cost-effective, flexible and efficient scientific simulations in cutting-edge research areas utilize huge high-end computing resources with thousands of processors. In the next five to ten years the number of processors in such computer systems will rise to tens of thousands, while scientific application running times are expected to increase further beyond the Mean-Time-To-Interrupt (MTTI) of hardware and system software components. This paper describes the ongoing research in heterogeneous adaptable reconfigurable networked systems (Harness) and its recent achievements in the area of high availability distributed virtual machine environments for parallel and distributed scientific computing. It shows how a distributed control algorithm is able to steer a distributed virtual machine process in virtual synchrony while maintaining consistent replication for high availability. It briefly illustrates ongoing work in heterogeneous reconfigurable communication frameworks and security mechanisms. The paper continues with a short overview of similar research in reliable group communication frameworks, fault-tolerant process groups and highly available distributed virtual processes. It closes with a brief discussion of possible future research directions.}
}
% Workshop paper (HAPCW 2004, with the LACSI Symposium) on decentralized metadata management for scavenged cluster storage.
@conference{he04highly,
  author    = {Xubin (Ben) He and Li Ou and Stephen L. Scott and Christian Engelmann},
  title     = {A Highly Available Cluster Storage System using Scavenging},
  booktitle = {Proceedings of the \href{http://xcr.cenit.latech.edu/hapcw2004}{$2^{nd}$ High Availability and Performance Workshop (HAPCW) 2004}, in conjunction with the \href{http://lacsi.rice.edu/symposium/agenda_2004}{$5^{th}$ Los Alamos Computer Science Institute (LACSI) Symposium 2004}},
  month     = oct # "~12, ",
  year      = {2004},
  address   = {Santa Fe, NM, USA},
  url       = {http://www.christian-engelmann.info/publications/he04highly.pdf},
  url2      = {http://www.christian-engelmann.info/publications/he04highly.ppt.pdf},
  abstract  = {Highly available data storage for high-performance computing is becoming increasingly more critical as high-end computing systems scale up in size and storage systems are developed around network-centered architectures. A promising solution is to harness the collective storage potential of individual workstations much as we harness idle CPU cycles due to the excellent price/performance ratio and low storage usage of most commodity workstations. For such a storage system, metadata consistency is a key issue assuring storage system availability as well as data reliability. In this paper, we present a decentralized metadata management scheme that improves storage availability without sacrificing performance.}
}
% Workshop paper (CLADE 2003, with HPDC 2003) on diskless checkpointing applied to the FFT.
% NOTE(review): the doi field holds an IEEE Xplore URL rather than a DOI; confirm and replace once the DOI is known.
@conference{engelmann03diskless,
  author    = {Christian Engelmann and George A. (Al) Geist},
  title     = {A Diskless Checkpointing Algorithm for Super-scale Architectures Applied to the Fast Fourier Transform},
  booktitle = {Proceedings of the \href{http://www.cs.msstate.edu/~clade2003}{Challenges of Large Applications in Distributed Environments Workshop (CLADE) 2003}, in conjunction with the \href{http://csag.ucsd.edu/HPDC-12}{$12^{th}$ IEEE International Symposium on High Performance Distributed Computing (HPDC) 2003}},
  pages     = {47},
  month     = jun # "~21, ",
  year      = {2003},
  address   = {Seattle, WA, USA},
  publisher = {\href{http://www.computer.org}{IEEE Computer Society, Los Alamitos, CA, USA}},
  isbn      = {0-7695-1984-9},
  doi       = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4159902},
  url       = {http://www.christian-engelmann.info/publications/engelmann03diskless.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann03diskless.ppt.pdf},
  abstract  = {This paper discusses the issue of fault-tolerance in distributed computer systems with tens or hundreds of thousands of diskless processor units. Such systems, like the IBM Blue Gene/L, are predicted to be deployed in the next five to ten years. Since a 100,000-processor system is going to be less reliable, scientific applications need to be able to recover from occurring failures more efficiently. In this paper, we adapt the present technique of diskless checkpointing to such huge distributed systems in order to equip existing scientific algorithms with super-scalable fault-tolerance. First, we discuss the method of diskless checkpointing, then we adapt this technique to super-scale architectures and finally we present results from an implementation of the Fast Fourier Transform that uses the adapted technique to achieve super-scale fault-tolerance.}
}
% ICCS 2002 paper (LNCS volume 2330) on the distributed peer-to-peer control of Harness.
@conference{engelmann02distributed,
  author    = {Christian Engelmann and Stephen L. Scott and George A. (Al) Geist},
  title     = {Distributed Peer-to-Peer Control in {Harness}},
  booktitle = {Lecture Notes in Computer Science: Proceedings of the \href{http://www.science.uva.nl/events/ICCS2002}{$2^{nd}$ International Conference on Computational Science (ICCS) 2002}, Part II: Workshop on Global and Collaborative Computing},
  volume    = {2330},
  pages     = {720--727},
  month     = apr # "~21-24, ",
  year      = {2002},
  address   = {Amsterdam, The Netherlands},
  publisher = {\href{http://www.springer.com}{Springer Verlag, Berlin, Germany}},
  isbn      = {3-540-43593-X},
  issn      = {0302-9743},
  doi       = {http://www.springerlink.com/content/l537ujfwt8yta2dp},
  url       = {http://www.christian-engelmann.info/publications/engelmann02distributed.pdf},
  url2      = {http://www.christian-engelmann.info/publications/engelmann02distributed.ppt.pdf},
  abstract  = {Harness is an adaptable fault-tolerant virtual machine environment for next-generation heterogeneous distributed computing developed as a follow on to PVM. It additionally enables the assembly of applications from plug-ins and provides fault-tolerance. This work describes the distributed control, which manages global state replication to ensure a high-availability of service. Group communication services achieve an agreement on an initial global state and a linear history of global state changes at all members of the distributed virtual machine. This global state is replicated to all members to easily recover from single, multiple and cascaded faults. A peer-to-peer ring network architecture and tunable multi-point failure conditions provide heterogeneity and scalability. Finally, the integration of the distributed control into the multi-threaded kernel architecture of Harness offers a fault-tolerant global state database service for plug-ins and applications.}
}
% Poster at SC 2011 introducing RedMPI (silent data corruption detection/correction for MPI).
% The venue ordinal uses the $N^{th}$ form like the other entries in this file.
@misc{fiala11detection,
  author       = {David Fiala and Frank Mueller and Christian Engelmann and Rolf Riesen and Kurt Ferreira},
  title        = {Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing},
  month        = nov # "~12-18, ",
  year         = {2011},
  howpublished = {{Poster at the \href{http://sc11.supercomputing.org} {$24^{th}$ IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2011}, Seattle, WA, USA}},
  url          = {},
  abstract     = {Faults have become the norm rather than the exception for high-end computing on clusters with 10s/100s of thousands of cores. Exacerbating this situation, some of these faults will not be detected, manifesting themselves as silent errors that will corrupt memory while applications continue to operate and report incorrect results. This poster introduces RedMPI, an MPI library which resides in the MPI profiling layer. RedMPI is capable of both online detection and correction of soft errors that occur in MPI applications without requiring any modifications to the application source. By providing redundancy, RedMPI is capable of transparently detecting corrupt messages from MPI processes that become faulted during execution. Furthermore, with triple redundancy RedMPI additionally ``votes'' out MPI messages of a faulted process by replacing corrupted results with corrected results from unfaulted processes. We present an experimental evaluation of RedMPI on an assortment of applications to demonstrate the effectiveness of this approach.}
}
% Poster at SC 2011 introducing LIBSDC (software-based DRAM error detection/correction).
% The venue ordinal uses the $N^{th}$ form like the other entries in this file.
@misc{fiala11tunable2,
  author       = {David Fiala and Kurt Ferreira and Frank Mueller and Christian Engelmann},
  title        = {A Tunable, Software-based DRAM Error Detection and Correction Library for HPC},
  month        = nov # "~12-18, ",
  year         = {2011},
  howpublished = {{Poster at the \href{http://sc11.supercomputing.org} {$24^{th}$ IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2011}, Seattle, WA, USA}},
  url          = {},
  abstract     = {Proposed exascale systems will present a number of considerable resiliency challenges. In particular, DRAM soft-errors, or bit-flips, are expected to greatly increase due to the increased memory density of these systems. Current hardware-based fault-tolerance methods will be unsuitable for addressing the expected soft error frequency rate. As a result, additional software will be needed to address this challenge. In this paper we introduce LIBSDC, a tunable, transparent silent data corruption detection and correction library for HPC applications. LIBSDC provides comprehensive SDC protection for program memory by implementing on-demand page integrity verification by utilizing the MMU. Experimental benchmarks with Mantevo HPCCG show that once tuned, LIBSDC is able to achieve SDC protection with less than 100\% overhead of resources.}
}
% Poster at the National HPC Workshop on Resilience 2009; same title and abstract as scott09tunable (PPoPP 2009).
@misc{scott09tunable2,
  author       = {Stephen L. Scott and Christian Engelmann and Geoffroy R. Vall\'ee and Thomas Naughton and Anand Tikotekar and George Ostrouchov and Chokchai (Box) Leangsuksun and Nichamon Naksinehaboon and Raja Nassar and Mihaela Paun and Frank Mueller and Chao Wang and Arun B. Nagarajan and Jyothish Varma},
  title        = {A Tunable Holistic Resiliency Approach for High-Performance Computing Systems},
  month        = aug # "~12-14, ",
  year         = {2009},
  howpublished = {{Poster at the \href{http://institute.lanl.gov/resilience/conferences/2009} {National HPC Workshop on Resilience 2009}, Arlington, VA, USA}},
  url          = {http://www.christian-engelmann.info/publications/scott09tunable2.pdf},
  abstract     = {In order to address anticipated high failure rates, resiliency characteristics have become an urgent priority for next-generation extreme-scale high-performance computing (HPC) systems. This poster describes our past and ongoing efforts in novel fault resilience technologies for HPC. Presented work includes proactive fault resilience techniques, system and application reliability models and analyses, failure prediction, transparent process- and virtual-machine-level migration, and trade-off models for combining preemptive migration with checkpoint/restart. This poster summarizes our work and puts all individual technologies into context with a proposed holistic fault resilience framework.}
}
% Poster at the National HPC Workshop on Resilience 2009 on system-level virtualization for HPC.
@misc{scott09systemlevel,
  author       = {Stephen L. Scott and Geoffroy R. Vall\'ee and Thomas Naughton and Anand Tikotekar and Christian Engelmann and Hong H. Ong},
  title        = {System-level Virtualization for High-Performance Computing},
  month        = aug # "~12-14, ",
  year         = {2009},
  howpublished = {{Poster at the \href{http://institute.lanl.gov/resilience/conferences/2009} {National HPC Workshop on Resilience 2009}, Arlington, VA, USA}},
  url          = {http://www.christian-engelmann.info/publications/scott09systemlevel.pdf},
  abstract     = {This poster summarizes our past and ongoing research and development efforts in novel system software solutions for providing a virtual system environment (VSE) for next-generation extreme-scale high-performance computing (HPC) systems and beyond. The poster showcases results of developed proof-of-concept implementations and performed theoretical analyses, outlines planned research and development activities, and presents respective initial results.}
}
% Poster at PPoPP 2009 on a holistic fault resilience framework for HPC.
@misc{scott09tunable,
  author       = {Stephen L. Scott and Christian Engelmann and Geoffroy R. Vall\'ee and Thomas Naughton and Anand Tikotekar and George Ostrouchov and Chokchai (Box) Leangsuksun and Nichamon Naksinehaboon and Raja Nassar and Mihaela Paun and Frank Mueller and Chao Wang and Arun B. Nagarajan and Jyothish Varma},
  title        = {A Tunable Holistic Resiliency Approach for High-Performance Computing Systems},
  month        = feb # "~14-18, ",
  year         = {2009},
  howpublished = {{Poster at the \href{http://ppopp09.rice.edu}{$14^{th}$ ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (PPoPP) 2009}, Raleigh, NC, USA}},
  url          = {http://www.christian-engelmann.info/publications/scott09tunable.pdf},
  abstract     = {In order to address anticipated high failure rates, resiliency characteristics have become an urgent priority for next-generation extreme-scale high-performance computing (HPC) systems. This poster describes our past and ongoing efforts in novel fault resilience technologies for HPC. Presented work includes proactive fault resilience techniques, system and application reliability models and analyses, failure prediction, transparent process- and virtual-machine-level migration, and trade-off models for combining preemptive migration with checkpoint/restart. This poster summarizes our work and puts all individual technologies into context with a proposed holistic fault resilience framework.}
}
% Poster at HPCSW 2008 on the Harness Workbench.
% The post-colon word is protected as a whole word ({Unified}) rather than by a single-letter brace.
@misc{geist08harness,
  author       = {George A. (Al) Geist and Christian Engelmann and Jack J. Dongarra and George Bosilca and Magdalena M. S\l{}awi\'nska and Jaros\l{}aw K. S\l{}awi\'nski},
  title        = {The {Harness} Workbench: {Unified} and Adaptive Access to Diverse High-Performance Computing Platforms},
  month        = mar # "~30 - " # apr # "~5, ",
  year         = {2008},
  howpublished = {{Poster at the \href{http://www.hpcsw.org}{$1^{st}$ High-Performance Computer Science Week (HPCSW) 2008}, Denver, CO, USA}},
  url          = {http://www.christian-engelmann.info/publications/geist08harness.pdf},
  abstract     = {This poster summarizes our past and ongoing research and development efforts in novel software solutions for providing unified and adaptive access to diverse high-performance computing (HPC) platforms. The poster showcases developed proof-of-concept implementations of tools and mechanisms that simplify scientific application development and deployment tasks, such that only minimal adaptation is needed when moving from one HPC system to another or after HPC system upgrades.}
}
% Poster at HPCSW 2008 on reliability, availability and serviceability (RAS) for HPC systems.
@misc{scott08resiliency,
  author       = {Stephen L. Scott and Christian Engelmann and Hong H. Ong and Geoffroy R. Vall\'ee and Thomas Naughton and Anand Tikotekar and George Ostrouchov and Chokchai (Box) Leangsuksun and Nichamon Naksinehaboon and Raja Nassar and Mihaela Paun and Frank Mueller and Chao Wang and Arun B. Nagarajan and Jyothish Varma and Xubin (Ben) He and Li Ou and Xin Chen},
  title        = {Resiliency for High-Performance Computing Systems},
  month        = mar # "~30 - " # apr # "~5, ",
  year         = {2008},
  howpublished = {{Poster at the \href{http://www.hpcsw.org}{$1^{st}$ High-Performance Computer Science Week (HPCSW) 2008}, Denver, CO, USA}},
  url          = {http://www.christian-engelmann.info/publications/scott08resiliency.pdf},
  abstract     = {This poster summarizes our past and ongoing research and development efforts in novel system software solutions for providing high-level reliability, availability and serviceability (RAS) for next-generation extreme-scale high-performance computing (HPC) systems and beyond. The poster showcases results of developed proof-of-concept implementations and performed theoretical analyses, outlines planned research and development activities, and presents respective initial results.}
}
% Poster at HPCSW 2008 on system-level virtualization for HPC.
@misc{scott08systemlevel,
  author       = {Stephen L. Scott and Geoffroy R. Vall\'ee and Thomas Naughton and Anand Tikotekar and Christian Engelmann and Hong H. Ong},
  title        = {System-level Virtualization for High-Performance Computing},
  month        = mar # "~30 - " # apr # "~5, ",
  year         = {2008},
  howpublished = {{Poster at the \href{http://www.hpcsw.org}{$1^{st}$ High-Performance Computer Science Week (HPCSW) 2008}, Denver, CO, USA}},
  url          = {http://www.christian-engelmann.info/publications/scott08systemlevel.pdf},
  abstract     = {This poster summarizes our past and ongoing research and development efforts in novel system software solutions for providing a virtual system environment (VSE) for next-generation extreme-scale high-performance computing (HPC) systems and beyond. The poster showcases results of developed proof-of-concept implementations and performed theoretical analyses, outlines planned research and development activities, and presents respective initial results.}
}
% Workshop report (April 2013) on addressing failures in exascale computing.
% The compound surname "Van Hensbergen" is braced so BibTeX does not read "Van" as a given name.
@misc{snir13addressing,
  author       = {Marc Snir and Robert W. Wisniewski and Jacob A. Abraham and Sarita V. Adve and Saurabh Bagchi and Pavan Balaji and Bill Carlson and Andrew A. Chien and Pedro Diniz and Christian Engelmann and Rinku Gupta and Fred Johnson and Jim Belak and Pradip Bose and Franck Cappello and Paul Coteus and Nathan A. DeBardeleben and Mattan Erez and Saverio Fazzari and Al Geist and Sriram Krishnamoorthy and Sven Leyffer and Dean Liberty and Subhasish Mitra and Todd Munson and Rob Schreiber and Jon Stearley and Eric {Van Hensbergen}},
  title        = {Addressing Failures in Exascale Computing},
  howpublished = {Workshop report},
  month        = apr,
  year         = {2013},
  url          = {http://www.christian-engelmann.info/publications/snir13addressing.pdf}
}
% Workshop report (August 2012) from the DOE Fault Management Workshop held June 6, 2012.
@misc{geist12department,
  author       = {Al Geist and Bob Lucas and Marc Snir and Shekhar Borkar and Eric Roman and Mootaz Elnozahy and Bert Still and Andrew Chien and Robert Clay and John Wu and Christian Engelmann and Nathan DeBardeleben and Rob Ross and Larry Kaplan and Martin Schulz and Mike Heroux and Sriram Krishnamoorthy and Lucy Nowell and Abhinav Vishnu and Lee-Ann Talley},
  title        = {U.S. Department of Energy Fault Management Workshop},
  howpublished = {Workshop report submitted to the U.S. Department of Energy},
  month        = aug,
  year         = {2012},
  url          = {http://www.christian-engelmann.info/publications/geist12department.pdf},
  abstract     = {A Department of Energy (DOE) Fault Management Workshop was held on June 6, 2012 at the BWI Airport Marriott hotel in Maryland. The goals of this workshop were to: 1. Describe the required HPC resilience for critical DOE mission needs; 2. Detail what HPC resilience research is already being done at the DOE national laboratories and is expected to be done by industry or other groups; 3. Determine what fault management research is a priority for DOE's Office of Science and National Nuclear Security Administration (NNSA) over the next five years; 4. Develop a roadmap for getting the necessary research accomplished in the timeframe when it will be needed by the large computing facilities across DOE.}
}
% Whitepaper (August 2012) on a performance/resilience/power co-design toolkit.
% Ampersands in the workshop name are escaped; a bare ampersand is an alignment character in LaTeX and aborts typesetting.
@misc{engelmann12performance,
  author       = {Christian Engelmann and Thomas Naughton},
  title        = {A Performance/Resilience/Power Co-design Tool for Extreme-scale High-Performance Computing},
  howpublished = {Whitepaper submitted to the U.S. Department of Energy's Workshop on Modeling \& Simulation of Exascale Systems \& Applications},
  month        = aug,
  year         = {2012},
  url          = {http://www.christian-engelmann.info/publications/engelmann12performance.pdf},
  abstract     = {Performance, resilience and power consumption are key HPC system design factors that are highly interdependent. To enable extreme-scale computing it is essential to perform HPC hardware/software co-design that identifies the cost/benefit trade-off between these design factors for potential future architecture choices. The proposed research and development aims at developing an HPC hardware/software co-design toolkit for evaluating the resilience/power/performance cost/benefit trade-off of future architecture choices. The approach focuses on extending a simulation-based performance investigation toolkit with advanced resilience and power modeling and simulation features, such as (i) fault injection mechanisms, (ii) fault propagation, isolation, and detection models, (iii) fault avoidance, masking, and recovery simulation, and (iv) power consumption models.}
}
% Whitepaper (July 2012) on a self-aware operating system/runtime for exascale systems.
@misc{engelmann12dynamic,
  author       = {Christian Engelmann and Geoffroy R. Vall\'ee and Thomas Naughton and Frank Mueller},
  title        = {Dynamic Self-Aware Runtime Software for Exascale Systems},
  howpublished = {Whitepaper submitted to the U.S. Department of Energy's Exascale Operating Systems and Runtime Technical Council},
  month        = jul,
  year         = {2012},
  url          = {http://www.christian-engelmann.info/publications/engelmann12dynamic.pdf},
  url2         = {http://www.christian-engelmann.info/publications/engelmann12dynamic.ppt.pdf},
  abstract     = {At exascale, the power consumption, resilience, and load balancing constraints, especially their dynamic nature and interdependence, and the scale of the system require a radical change in future high-performance computing (HPC) operating systems and runtimes (OS/Rs). In contrast to the existing static OS/R solutions, an exascale OS/R is needed that is aware of the dynamically changing resources, constraints, and application needs, and that is able to autonomously coordinate (sometimes conflicting) responses to different changes in the system, simultaneously and at scale. To provide awareness and autonomic management, a novel, scalable and self-aware OS/R is needed that becomes the brains of the entire X-stack. It dynamically analyzes past, current, and future system status and application needs. It optimizes system usage by scheduling, migrating, and restarting tasks within and across nodes as needed to deal with multi-dimensional constraints, such as power consumption, permanent and transient faults, resource degradation, heterogeneity, data locality, and load balance.}
}
% Whitepaper (December 2009) on high-end computing resilience, submitted to the NSF High-end Computing Program.
@misc{debardeleben09high-end,
  author       = {Nathan DeBardeleben and James Laros and John T. Daly and Stephen L. Scott and Christian Engelmann and Bill Harrod},
  title        = {High-End Computing Resilience: {Analysis} of Issues Facing the {HEC} Community and Path-Forward for Research and Development},
  howpublished = {Whitepaper submitted to the U.S. National Science Foundation's High-end Computing Program},
  month        = dec,
  year         = {2009},
  url          = {http://www.christian-engelmann.info/publications/debardeleben09high-end.pdf}
}
% ORNL technical report ORNL/TM-2012/227 on RedMPI (redundancy-based detection/correction of silent data corruption).
@techreport{fiala12detection,
  author      = {David Fiala and Frank Mueller and Christian Engelmann and Kurt Ferreira and Ron Brightwell and Rolf Riesen},
  title       = {Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing},
  institution = {Oak Ridge National Laboratory},
  number      = {ORNL/TM-2012/227},
  address     = {Oak Ridge, TN, USA},
  month       = jun,
  year        = {2012},
  url         = {http://www.christian-engelmann.info/publications/fiala12detection.pdf},
  abstract    = {Faults have become the norm rather than the exception for high-end computing on clusters with 10s/100s of thousands of cores. Exacerbating this situation, some of these faults remain undetected, manifesting themselves as silent errors that corrupt memory while applications continue to operate and report incorrect results. This paper studies the potential for redundancy to both detect and correct soft errors in MPI message-passing applications. Our study investigates the challenges inherent to detecting soft errors within MPI applications while providing transparent MPI redundancy. By assuming a model wherein corruption in application data manifests itself by producing differing MPI message data between replicas, we study the best suited protocols for detecting and correcting MPI data that is the result of corruption. To experimentally validate our proposed detection and correction protocols, we introduce RedMPI, an MPI library which resides in the MPI profiling layer. RedMPI is capable of both online detection and correction of soft errors that occur in MPI applications without requiring any modifications to the application source by utilizing either double or triple redundancy. Our results indicate that our most efficient consistency protocol can successfully protect applications experiencing even high rates of silent data corruption with runtime overheads between 0\% and 30\% as compared to unprotected applications without redundancy.
Using our fault injector within RedMPI, we observe that even a single soft error can have profound effects on running applications, causing a cascading pattern of corruption that in most cases spreads to all other processes. RedMPI's protection has been shown to successfully mitigate the effects of soft errors while allowing applications to complete with correct results even in the face of errors.}
}
% ORNL technical report ORNL/TM-2010/162 on hybrid full/incremental checkpointing for MPI jobs.
@techreport{wang10hybrid,
  author      = {Chao Wang and Frank Mueller and Christian Engelmann and Stephen L. Scott},
  title       = {Hybrid Full/Incremental Checkpoint/Restart for {MPI} Jobs in {HPC} Environments},
  institution = {Oak Ridge National Laboratory},
  number      = {ORNL/TM-2010/162},
  address     = {Oak Ridge, TN, USA},
  month       = aug,
  year        = {2010},
  url         = {http://www.christian-engelmann.info/publications/wang10hybrid.pdf},
  abstract    = {As the number of cores in high-performance computing environments keeps increasing, faults are becoming common place. Checkpointing addresses such faults but captures full process images even though only a subset of the process image changes between checkpoints. We have designed a high-performance hybrid disk-based full/incremental checkpointing technique for MPI tasks to capture only data changed since the last checkpoint. Our implementation integrates new BLCR and LAM/MPI features that complement traditional full checkpoints. This results in significantly reduced checkpoint sizes and overheads with only moderate increases in restart overhead. After accounting for cost and savings, benefits due to incremental checkpoints significantly outweigh the loss on restart operations. Experiments in a cluster with the NAS Parallel Benchmark suite and mpiBLAST indicate that savings due to replacing full checkpoints with incremental ones average 16.64 seconds while restore overhead amounts to just 1.17 seconds. These savings increase with the frequency of incremental checkpoints. Overall, our novel hybrid full/incremental checkpointing is superior to prior non-hybrid techniques.}
}
% ORNL technical report ORNL/TM-2010/161 on proactive process-level live migration and back migration.
@techreport{wang10proactive,
  author      = {Chao Wang and Frank Mueller and Christian Engelmann and Stephen L. Scott},
  title       = {Proactive Process-Level Live Migration and Back Migration in {HPC} Environments},
  institution = {Oak Ridge National Laboratory},
  number      = {ORNL/TM-2010/161},
  address     = {Oak Ridge, TN, USA},
  month       = aug,
  year        = {2010},
  url         = {http://www.christian-engelmann.info/publications/wang10proactive.pdf},
  abstract    = {As the number of nodes in high-performance computing environments keeps increasing, faults are becoming common place. Reactive fault tolerance (FT) often does not scale due to massive I/O requirements and relies on manual job resubmission. This work complements reactive with proactive FT at the process level. Through health monitoring, a subset of node failures can be anticipated when one's health deteriorates. A novel process-level live migration mechanism supports continued execution of applications during much of process migration. This scheme is integrated into an MPI execution environment to transparently sustain health-inflicted node failures, which eradicates the need to restart and requeue MPI jobs. Experiments indicate that 1-6.5 seconds of prior warning are required to successfully trigger live process migration while similar operating system virtualization mechanisms require 13-24 seconds. This self-healing approach complements reactive FT by nearly cutting the number of checkpoints in half when 70\% of the faults are handled proactively. The work also provides a novel back migration approach to eliminate load imbalance or bottlenecks caused by migrated tasks. Experiments indicate the larger the amount of outstanding execution, the higher the benefit due to back migration will be.}
}
% Invited talk (ANL ICiS summer workshop week, August 2012) summarizing the 2009 National HPC Workshop on Resilience.
% {Analysis} and {HEC} are brace-protected so case-changing styles keep their capitals.
@misc{engelmann12high-end,
  author       = {Christian Engelmann},
  title        = {High-End Computing Resilience: {Analysis} of Issues Facing the {HEC} Community and Path Forward for Research and Development},
  month        = aug # "~4-11, ",
  year         = {2012},
  howpublished = {{Invited talk at the Argonne National Laboratory (ANL) Institute of Computing in Science (ICiS) \href{http://www.icis.anl.gov/programs/summer2012-4b} {Summer Workshop Week on Addressing Failures in Exascale Computing}, Park City, UT, USA}},
  url          = {http://www.christian-engelmann.info/publications/engelmann12high-end.ppt.pdf},
  abstract     = {The path to exascale computing poses several research challenges related to power, performance, resilience, productivity, programmability, data movement, and data management. Resilience, i.e., providing efficiency and correctness in the presence of faults, is one of the most important exascale computer science challenges as systems scale up in component count (100,000-1,000,000 nodes with 1,000-10,000 cores per node by 2020) and component reliability decreases (7 nm technology with near-threshold voltage operation by 2020). To provide input for a discussion of future needs in resilience research, development, and standards work, this talk gives a brief summary of the outcomes from the National HPC Workshop on Resilience, held in Arlington, VA, USA on August 12-14, 2009.}
}
% Invited talk at the SOS 16 workshop (March 2012) on resilience research at ORNL.
@misc{engelmann12resilience,
  author       = {Christian Engelmann},
  title        = {Resilience for Permanent, Transient, and Undetected Errors},
  month        = mar # "~12-15, ",
  year         = {2012},
  howpublished = {{Invited talk at the \href{http://www.cs.sandia.gov/Conferences/SOS16} {$16^{th}$ Workshop on Distributed Supercomputing (SOS) 2012}, Santa Barbara, CA, USA}},
  url          = {http://www.christian-engelmann.info/publications/engelmann12resilience.ppt.pdf},
  abstract     = {With the ongoing deployment of 10-20 PFlop/s supercomputers and the exascale roadmap targeting 100, 300, and eventually 1,000 PFlop/s by 2020, the path to exascale computing poses several research challenges related to power, performance, resilience, productivity, programmability, data movement, and data management. Resilience, i.e., providing efficiency and correctness in the presence of faults, is one of the most important exascale computer science challenges as systems scale up in component count (100,000-1,000,000 nodes with 1,000-10,000 cores per node by 2020) and component reliability decreases (7 nm technology with near-threshold voltage operation by 2020). This talk provides an overview of recent and ongoing resilience research and development activities at Oak Ridge National Laboratory, and of future needs in resilience research, development, and standards work.}
}
% Invited talk at ESPAS 2012 (with HiPEAC 2012) on xSim scaling results for a Monte Carlo algorithm.
% The post-colon {A} is brace-protected so case-changing styles keep it capitalized.
@misc{engelmann12scaling,
  author       = {Christian Engelmann},
  title        = {Scaling To A Million Cores And Beyond: {A} Basic Understanding Of The Challenges Ahead On The Road To Exascale},
  month        = jan # "~24, ",
  year         = {2012},
  howpublished = {{Invited talk at the \href{https://researcher.ibm.com/researcher/view_page.php?id=2580} {$1^{st}$ International Workshop on Extreme Scale Parallel Architectures and Systems (ESPAS) 2012}, in conjunction with the \href{http://www.hipeac.net/conference/paris}{$7^{th}$ International Conference on High-Performance and Embedded Architectures and Compilers (HiPEAC) 2012}, Paris, France}},
  url          = {http://www.christian-engelmann.info/publications/engelmann12scaling.ppt.pdf},
  abstract     = {On the road toward multi-petascale and exascale HPC, the trend in architecture goes clearly in only one direction. HPC systems will dramatically scale up in compute node and processor core counts. By 2020, an exascale system may have up to 1,000,000 compute nodes with 1,000 cores per node. The substantial growth in concurrency causes parallel application scalability issues due to sequential application parts, synchronizing communication, and other bottlenecks. Investigating parallel algorithm performance properties at this scale and with these architectural properties for HPC hardware/software co-design is crucial to enable extreme-scale computing. The presented work utilizes the Extreme-scale Simulator (xSim) performance investigation toolkit to identify the scaling characteristics of a simple Monte Carlo algorithm from 1 to 16 million MPI processes on different multi-core architecture choices. The results show the limitations of strong scaling and the negative impact of employing more but less powerful cores for energy savings.}
}
% Invited talk at the SC 2011 Birds of a Feather session on resilient software for exascale computing.
% {ExaScale} is brace-protected to keep its mixed case; the venue ordinal uses the $N^{th}$ form like the other entries in this file.
@misc{engelmann11resilient,
  author       = {Christian Engelmann},
  title        = {Resilient Software for {ExaScale} Computing},
  month        = nov # "~17, ",
  year         = {2011},
  howpublished = {{Invited talk at the Birds of a Feather Session on Resilient Software for ExaScale Computing at the \href{http://sc11.supercomputing.org} {$24^{th}$ IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2011}, Seattle, WA, USA}},
  url          = {http://www.christian-engelmann.info/publications/engelmann11resilient.ppt.pdf},
  abstract     = {ExaScale computing systems will likely consist of millions of cores executing applications with billions of threads, based on 14nm or less CMOS technology, according to the ITRS roadmap. Processing elements built on this technology, coupled with dynamic power management will exhibit high variability in performance, between cores and across different runs. Even worse, preliminary figures indicate that on average about every couple of minutes - at least - something in the system will break. Traditional checkpointing strategies are unlikely to work, given the time it will take to save the huge quantities of data combined with the fact that they will need to be restored frequently. This BoF wants to investigate resilient software: software that is able to survive failing hardware and continue to run, with minimal performance impact. Furthermore, we may also discuss tradeoffs between rerunning the application and the cost of instrumentation to deal with resilience.}
}
% Seminar at the Barcelona Supercomputing Center, 27 Jul 2011.
@misc{engelmann11resilience,
  author       = {Engelmann, Christian},
  title        = {Resilience and Hardware/Software Co-design for Extreme-Scale Supercomputing},
  month        = jul # "~27, ",
  year         = {2011},
  howpublished = {{Seminar at the \href{http://www.bsc.es}{Barcelona Supercomputing Center}, Barcelona, Spain}},
  url          = {http://www.christian-engelmann.info/publications/engelmann11resilience.ppt.pdf},
  abstract     = {Oak Ridge National Laboratory (ORNL) provides the most powerful high-performance computing (HPC) resources in the world for open scientific research. Jaguar, a 224,162-core Cray XT5 with a LINPACK performance of 1.759 PFlop/s, for example, is the world's 3rd fastest supercomputer. 80\% of its resources are allocated through a reviewed process to address the most challenging scientific problems in climate modeling, renewable energy, materials science, fusion and other areas. ORNL's Computer Science and Mathematics Division performs computer science and mathematics research to increase supercomputer efficiency and application scientist productivity while accelerating time to solution for scientific breakthroughs. This talk details recent research advancements at ORNL in two areas: (1) resilience and (2) hardware/software co-design for extreme-scale supercomputing. Both are essential on the road toward exa-scale HPC systems with millions-to-billions of cores. Due to the expected drastic increase in scale, the corresponding decrease in system mean-time to interrupt warrants a rethinking of the traditional checkpoint/restart approach for HPC resilience. New concepts discussed in this talk range from preventative measures, such as task migration based on fault prediction, to more aggressive fault masking, such as various levels of redundancy. Further, the expected drastic increase in task parallelism requires redesigning algorithms to avoid the consequences of Amdahl's law at extreme scale. As million-way task parallel systems don't exist yet, this talk discusses a lightweight system simulation approach for performance estimation of algorithms at scale.}
}
% Invited talk, 3rd HPC Resiliency Summit (with LACSS 2010). The acronym in the title is brace-protected
% like in sibling entries; the "approximately 56" factor uses math-mode \sim because a bare tilde is a
% non-breaking space in LaTeX; "did not had" is corrected.
@misc{engelmann10scalable,
  author       = {Engelmann, Christian},
  title        = {Scalable {HPC} System Monitoring},
  month        = oct # "~13, ",
  year         = {2010},
  howpublished = {{Invited talk at the $3^{rd}$ HPC Resiliency Summit: Workshop on Resiliency for Petascale HPC 2010, in conjunction with the \href{http://www.lanl.gov/conferences/lacss/2010}{$3^{rd}$ Los Alamos Computer Science Symposium (LACSS) 2010}, Santa Fe, NM, USA}},
  url          = {http://www.christian-engelmann.info/publications/engelmann10scalable.ppt.pdf},
  abstract     = {We present a monitoring system for large-scale parallel and distributed computing environments that allows to trade-off accuracy in a tunable fashion to gain scalability without compromising fidelity. The approach relies on classifying each gathered monitoring metric based on individual needs and on aggregating messages containing classes of individual monitoring metrics using a tree-based overlay network. The MRNet-based prototype is able to significantly reduce the amount of gathered and stored monitoring data, e.g., by a factor of $\sim$56 in comparison to the Ganglia distributed monitoring system. A simple scaling study reveals, however, that further efforts are needed in reducing the amount of data to monitor future-generation extreme-scale systems with up to 1,000,000 nodes. The implemented solution did not have a measurable performance impact as the 32-node test system did not produce enough monitoring data to interfere with running applications.}
}
% Talk at the 39th SPEEDUP Workshop on High Performance Computing, Zurich, 6 Sep 2010.
@misc{engelmann10beyond,
  author       = {Engelmann, Christian},
  title        = {Beyond Application-Level Checkpoint/Restart - {Advanced} Software Approaches for Fault Resilience},
  month        = sep # "~6, ",
  year         = {2010},
  howpublished = {{Talk at the \href{http://www.speedup.ch/workshops/w39_2010.html} {$39^{th}$ SPEEDUP Workshop on High Performance Computing}, Zurich, Switzerland}},
  url          = {http://www.christian-engelmann.info/publications/engelmann10beyond.ppt.pdf}
}
% Talk at the FAST-OS Workshop (with USENIX Federated Conferences Week 2010).
% Venue now reads "Boston, MA, USA" (comma after the city was missing).
@misc{engelmann10reliability,
  author       = {Engelmann, Christian and Scott, Stephen L.},
  title        = {Reliability, Availability, and Serviceability ({RAS}) for Petascale High-End Computing and Beyond},
  month        = jun # "~22, ",
  year         = {2010},
  howpublished = {{Talk at the \href{http://www.usenix.org/events/fastos10} {Forum to Address Scalable Technology for Runtime and Operating Systems (FAST-OS) Workshop}, in conjunction with the \href{http://www.usenix.org/events/confweek10}{USENIX Federated Conferences Week (USENIX) 2010}, Boston, MA, USA}},
  url          = {http://www.christian-engelmann.info/publications/engelmann10reliability.ppt.pdf},
  abstract     = {This project aims at scalable technologies for providing high-level RAS for next-generation petascale scientific high-performance computing (HPC) resources and beyond as outlined by the U.S. Department of Energy (DOE) Forum to Address Scalable Technology for Runtime and Operating Systems (FAST-OS) and the U.S. National Coordination Office for Networking and Information Technology Research and Development (NCO/NITRD) High-End Computing Revitalization Task Force (HECRTF) activities. Based on virtualized adaptation, reconfiguration, and preemptive measures, the ultimate goal is to provide for non-stop scientific computing on a 24x7 basis without interruption. The taken technical approach leverages system-level virtualization technology to enable transparent proactive and reactive fault tolerance mechanisms on extreme scale HPC systems. This effort targets: (1) reliability analysis for identifying pre-fault indicators, predicting failures, and modeling and monitoring component and system reliability, (2) proactive fault tolerance technology based on preemptive migration away from components that are about to fail, (3) reactive fault tolerance enhancements, such as checkpoint interval and placement adaptation to actual and predicted system health threats, and (4) holistic fault tolerance through combination of adaptive proactive and reactive fault tolerance.}
}
% Talk at the 14th Workshop on Distributed Supercomputing (SOS) 2010, Savannah, 8-11 Mar 2010.
@misc{engelmann10resilience,
  author       = {Engelmann, Christian},
  title        = {Resilience Challenges at the Exascale},
  month        = mar # "~8-11, ",
  year         = {2010},
  howpublished = {{Talk at the \href{http://www.csm.ornl.gov/workshops/SOS14}{$14^{th}$ Workshop on Distributed Supercomputing (SOS) 2010}, Savannah, GA, USA}},
  url          = {http://www.christian-engelmann.info/publications/engelmann10resilience.ppt.pdf},
  abstract     = {The path to exascale computing poses several research challenges related to power, performance, resilience, productivity, programmability, data movement, and data management. Resilience, i.e., providing efficiency and correctness in the presence of faults, is one of the most important exascale computer science challenges as systems scale up in component count and component reliability decreases. This talk discusses the future needs in resilience research, development, and standards work based on the outcomes from the National HPC Workshop on Resilience, held in Arlington, VA, USA on August 12-14, 2009.}
}
% Seminar at the Leibniz Rechenzentrum (LRZ), Garching, 22 Feb 2010.
% The hash sign in the abstract is escaped so the text typesets under LaTeX (the file already
% escapes percent signs the same way); "U.S" gains its missing period.
@misc{engelmann10hpc,
  author       = {Engelmann, Christian and Scott, Stephen L.},
  title        = {{HPC} System Software Research at {Oak Ridge National Laboratory}},
  month        = feb # "~22, ",
  year         = {2010},
  howpublished = {{Seminar at the \href{http://www.lrz-muenchen.de}{Leibniz Rechenzentrum (LRZ)}, Garching, Germany}},
  url          = {http://www.christian-engelmann.info/publications/engelmann10hpc.ppt.pdf},
  abstract     = {Oak Ridge National Laboratory (ORNL) is the largest energy laboratory in the United States. Its National Center for Computational Sciences (NCCS) provides the most powerful computing resources in the world for open scientific research. Jaguar, a Cray XT5 system at NCCS, is the fastest supercomputer in the world. It recently ranked \#1 in the Top 500 List of Supercomputer Sites with a maximal LINPACK benchmark performance of 1.759 PFlop/s and a theoretical peak performance of 2.331 PFlop/s, where 1 PFlop/s is $10^{15}$ Floating Point Operations Per Second. Annually, 80 percent of Jaguar's resources are allocated through the U.S. Department of Energy's Innovative and Novel Computational Impact on Theory and Experiment (INCITE) program, a competitively selected, peer reviewed process open to researchers from universities, industry, government and non-profit organizations. These allocations address some of the most challenging scientific problems in areas such as climate modeling, renewable energy, materials science, fusion and combustion. In conjunction with NCCS, the Computer Science and Mathematics Division at ORNL performs basic and applied research in HPC, mathematics, and intelligent systems. This talk gives a summary of the HPC research and development in system software performed at ORNL, including resilience at extreme scale and virtualization technologies in HPC. Specifically, this talk will focus on advanced resilience technologies, such as migration of computation away from components that are about to fail and on management and customization of virtualized environments.}
}
% Seminar at the University of Reading, Department of Computer Science, 14 Dec 2009.
% The hash sign in the abstract is escaped so the text typesets under LaTeX; "U.S" gains its missing period.
@misc{engelmann09high2,
  author       = {Engelmann, Christian},
  title        = {High-Performance Computing Research Internship and Appointment Opportunities at {Oak Ridge National Laboratory}},
  month        = dec # "~14, ",
  year         = {2009},
  howpublished = {{Seminar at the \href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk} {University of Reading}, Reading, United Kingdom}},
  url          = {http://www.christian-engelmann.info/publications/engelmann09high2.ppt.pdf},
  abstract     = {Oak Ridge National Laboratory (ORNL) is the largest energy laboratory in the United States. Its National Center for Computational Sciences (NCCS) provides the most powerful computing resources in the world for open scientific research. Jaguar, a Cray XT5 system at NCCS, is the fastest supercomputer in the world. It recently ranked \#1 in the Top 500 List of Supercomputer Sites with a maximal LINPACK benchmark performance of 1.759 PFlop/s and a theoretical peak performance of 2.331 PFlop/s, where 1 PFlop/s is $10^{15}$ Floating Point Operations Per Second. Annually, 80 percent of Jaguar's resources are allocated through the U.S. Department of Energy's Innovative and Novel Computational Impact on Theory and Experiment (INCITE) program, a competitively selected, peer reviewed process open to researchers from universities, industry, government and non-profit organizations. These allocations address some of the most challenging scientific problems in areas such as climate modeling, renewable energy, materials science, fusion and combustion. In conjunction with NCCS, the Computer Science and Mathematics Division at ORNL performs basic and applied research in HPC, mathematics, and intelligent systems. This talk gives a summary of the HPC research performed at ORNL. It provides details about the Jaguar peta-scale computing resource, an overview of the computational science research carried out using ORNL's computing resources, and a description of various computer science efforts targeting solutions for next-generation HPC systems. This talk also provides information about internship opportunities for MSc students and research appointment opportunities for recent graduates.}
}
% Invited talk at the IAA Workshop on HPC Architectural Simulation (HPCAS), Boulder, 1-2 Sep 2009.
@misc{engelmann09jcas,
  author       = {Engelmann, Christian},
  title        = {{JCAS} - {IAA} Simulation Efforts at {Oak Ridge National Laboratory}},
  month        = sep # "~1-2, ",
  year         = {2009},
  howpublished = {{Invited talk at the \href{http://www.cs.sandia.gov/CSRI/Workshops/2009/IAA} {IAA Workshop on HPC Architectural Simulation (HPCAS)}, Boulder, CO, USA}},
  url          = {http://www.christian-engelmann.info/publications/engelmann09jcas.ppt.pdf}
}
% Invited talk at the National HPC Workshop on Resilience 2009, Arlington, 12-14 Aug 2009.
@misc{engelmann09modeling,
  author       = {Engelmann, Christian},
  title        = {Modeling Techniques Towards Resilience},
  month        = aug # "~12-14, ",
  year         = {2009},
  howpublished = {{Invited talk at the \href{http://institute.lanl.gov/resilience/conferences/2009} {National HPC Workshop on Resilience 2009}, Arlington, VA, USA}},
  url          = {http://www.christian-engelmann.info/publications/engelmann09modeling.ppt.pdf}
}
% Invited talk at INRIA, Rennes, 15 May 2009.
@misc{engelmann09system,
  author       = {Engelmann, Christian},
  title        = {System Resilience Research at {ORNL} in the Context of {HPC}},
  month        = may # "~15, ",
  year         = {2009},
  howpublished = {{Invited talk at the \href{http://www.inria.fr/inria/organigramme/fiche_ur-ren.fr.html} {Institut National de Recherche en Informatique et en Automatique (INRIA)}, Rennes, France}},
  url          = {http://www.christian-engelmann.info/publications/engelmann09system.pdf},
  abstract     = {The continuing growth in high performance computing (HPC) system scale poses a challenge for system software and scientific applications with respect to reliability, availability and serviceability (RAS). With only very few exceptions, the availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. As a result, sites lower allowable job run times in order to force applications to store intermediate results (checkpoints) as insurance against lost computation time. However, checkpoints themselves waste valuable computation time and resources. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically with the trend towards capability computing, which drives the race for scientific discovery by running applications on the fastest machines available while desiring significant amounts of time (weeks and months) without interruption. These machines must be able to run in the event of frequent interrupts in such a manner that the capability is not severely degraded. Thus, research and development of scalable RAS technologies is paramount to the success of future extreme-scale systems. This talk summarizes our accomplishments in the area of high-level RAS for HPC, such as developed concepts and implemented proof-of-concept prototypes.}
}
% Seminar at the University of Reading, Department of Computer Science, 11 May 2009.
% The hash sign in the abstract is escaped so the text typesets under LaTeX; "U.S" gains its missing period.
@misc{engelmann09high,
  author       = {Engelmann, Christian},
  title        = {High-Performance Computing Research and {MSc} Internship Opportunities at {Oak Ridge National Laboratory}},
  month        = may # "~11, ",
  year         = {2009},
  howpublished = {{Seminar at the \href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk} {University of Reading}, Reading, United Kingdom}},
  url          = {http://www.christian-engelmann.info/publications/engelmann09high.pdf},
  abstract     = {Oak Ridge National Laboratory (ORNL) is the largest energy laboratory in the United States. Its National Center for Computational Sciences (NCCS) provides the most powerful computing resources in the world for open scientific research. Jaguar, a Cray XT5 system at NCCS, is the second HPC system to exceed 1 PFlop/s ($10^{15}$ Floating Point Operations Per Second), and the fastest open science supercomputer in the world. It recently ranked \#2 in the Top 500 List of Supercomputer Sites with a maximal LINPACK benchmark performance of 1.059 PFlop/s and a theoretical peak performance of 1.3814 PFlop/s. Annually, 80 percent of Jaguar's resources are allocated through the U.S. Department of Energy's Innovative and Novel Computational Impact on Theory and Experiment (INCITE) program, a competitively selected, peer reviewed process open to researchers from universities, industry, government and non-profit organizations. These allocations address some of the most challenging scientific problems in areas such as climate modeling, renewable energy, materials science, fusion and combustion. In conjunction with NCCS, the Computer Science and Mathematics Division at ORNL performs basic and applied research in HPC, mathematics, and intelligent systems. This talk gives a summary of the HPC research performed at ORNL. It provides details about the Jaguar peta-scale computing resource, an overview of the computational science research carried out using ORNL's computing resources, and a description of various computer science efforts targeting solutions for next-generation HPC systems. This talk also provides information about internship opportunities for MSc students.}
}
% Invited talk at the Dagstuhl Seminar on Fault Tolerance in HPC and Grids, 3-8 May 2009.
% The ampersand in the abstract is escaped; a raw one is LaTeX's alignment character and fails in running text.
@misc{engelmann09modular,
  author       = {Engelmann, Christian},
  title        = {Modular Redundancy for Soft-Error Resilience in Large-Scale {HPC} Systems},
  month        = may # "~3-8, ",
  year         = {2009},
  howpublished = {{Invited talk at the \href{http://www.dagstuhl.de/en/program/calendar/semhp/?semnr=09191} {Dagstuhl Seminar on Fault Tolerance in High-Performance Computing and Grids}, Schloss Dagstuhl, Wadern, Germany}},
  url          = {http://www.christian-engelmann.info/publications/engelmann09modular.pdf},
  abstract     = {Recent investigations into resilience of large-scale high-performance computing (HPC) systems showed a continuous trend of decreasing reliability and availability. Newly installed systems have a lower mean-time to failure (MTTF) and a higher mean-time to recover (MTTR) than their predecessors. Modular redundancy is being used in many mission critical systems today to provide for resilience, such as for aerospace and command \& control systems. The primary argument against modular redundancy for resilience in HPC has always been that the capability of a HPC system, and respective return on investment, would be significantly reduced. We argue that modular redundancy can significantly increase compute node availability as it removes the impact of scale from single compute node MTTR. We further argue that single compute nodes can be much less reliable, and therefore less expensive, and still be highly available, if their MTTR/MTTF ratio is maintained.}
}
% Invited talk at the 3rd Collaborative and Grid Computing Technologies Workshop (CGCTW) 2009, Cancun, 22-24 Apr 2009.
@misc{engelmann09proactive2,
  author       = {Engelmann, Christian},
  title        = {Proactive Fault Tolerance Using Preemptive Migration},
  month        = apr # "~22-24, ",
  year         = {2009},
  howpublished = {{Invited talk at the \href{http://acet.rdg.ac.uk/events/details/cancun.php} {$3^{rd}$ Collaborative and Grid Computing Technologies Workshop (CGCTW) 2009}, Cancun, Mexico}},
  url          = {http://www.christian-engelmann.info/publications/engelmann09proactive2.pdf},
  abstract     = {The continuing growth in high-performance computing (HPC) system scale poses a challenge for system software and scientific applications with respect to reliability, availability and serviceability (RAS). In order to address anticipated high failure rates, resiliency characteristics have become an urgent priority for next-generation HPC systems. The concept of proactive fault tolerance prevents compute node failures from impacting running parallel applications by preemptively migrating application parts away from nodes that are about to fail. This talk presents our past and ongoing efforts in proactive fault resilience for HPC. Presented work includes proactive fault resilience techniques, transparent process- and virtual-machine-level migration, system and application reliability models and analyses, failure prediction, and trade-off models for combining preemptive migration with checkpoint/restart. All these individual technologies are put into context with a proposed holistic HPC fault resilience framework.}
}
% Panel at the 13th Workshop on Distributed Supercomputing (SOS) 2009, Hilton Head, 9-12 Mar 2009.
@misc{engelmann09resiliency,
  author       = {Engelmann, Christian},
  title        = {Resiliency},
  month        = mar # "~9-12, ",
  year         = {2009},
  howpublished = {{Panel at the \href{http://www.cs.sandia.gov/Conferences/SOS13}{$13^{th}$ Workshop on Distributed Supercomputing (SOS) 2009}, Hilton Head, SC, USA}}
}
% Invited talk at the Reading Annual Computational Science Workshop, 8 Dec 2008.
% Abstract made LaTeX-safe and ASCII-only: the power of ten is set in math mode as in sibling entries,
% the hash sign is escaped, and typographic apostrophes are replaced by plain ones; "U.S" gains its period.
@misc{engelmann08high,
  author       = {Engelmann, Christian},
  title        = {High-Performance Computing Research at {Oak Ridge National Laboratory}},
  month        = dec # "~8, ",
  year         = {2008},
  howpublished = {{Invited talk at the Reading Annual Computational Science Workshop, Reading, United Kingdom}},
  url          = {http://www.christian-engelmann.info/publications/engelmann08high.pdf},
  abstract     = {Oak Ridge National Laboratory (ORNL) is the largest energy laboratory in the United States. Its National Center for Computational Sciences (NCCS) provides the most powerful computing resources in the world for open scientific research. Jaguar, a Cray XT5 system at NCCS, is the second HPC system to exceed 1 PFlop/s ($10^{15}$ Floating Point Operations Per Second), and the fastest open science supercomputer in the world. It recently ranked \#2 in the Top 500 List of Supercomputer Sites with a maximal LINPACK benchmark performance of 1.059 PFlop/s and a theoretical peak performance of 1.3814 PFlop/s. Annually, 80 percent of Jaguar's resources are allocated through the U.S. Department of Energy's Innovative and Novel Computational Impact on Theory and Experiment (INCITE) program, a competitively selected, peer reviewed process open to researchers from universities, industry, government and non-profit organizations. These allocations address some of the most challenging scientific problems in areas such as climate modeling, renewable energy, materials science, fusion and combustion. In conjunction with NCCS, the Computer Science and Mathematics Division at ORNL performs basic and applied research in HPC, mathematics, and intelligent systems. This talk gives a summary of the HPC research performed at ORNL. It provides details about the Jaguar peta-scale computing resource, an overview of the computational science research carried out using ORNL's computing resources, and a description of various computer science efforts targeting solutions for next-generation HPC systems.}
}
% Invited talk at the 1st HPC Resiliency Summit (with LACSS 2008), Santa Fe, 15 Oct 2008.
@misc{engelmann08modular,
  author       = {Engelmann, Christian},
  title        = {Modular Redundancy in {HPC} Systems: {W}hy, Where, When and How?},
  month        = oct # "~15, ",
  year         = {2008},
  howpublished = {{Invited talk at the $1^{st}$ HPC Resiliency Summit: Workshop on Resiliency for Petascale HPC 2008, in conjunction with the \href{http://www.lanl.gov/conferences/lacss/2008}{$1^{st}$ Los Alamos Computer Science Symposium (LACSS) 2008}, Santa Fe, NM, USA}},
  url          = {http://www.christian-engelmann.info/publications/engelmann08modular.ppt.pdf},
  abstract     = {The continuing growth in high-performance computing (HPC) system scale poses a challenge for system software and scientific applications with respect to reliability, availability and serviceability (RAS). With only very few exceptions, the availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. As a result, sites lower allowable job run times in order to force applications to store intermediate results (checkpoints) as insurance against lost computation time. However, checkpoints themselves waste valuable computation time and resources. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically with the trend towards capability computing, which drives the race for scientific discovery by running applications on the fastest machines available while desiring significant amounts of time (weeks and months) without interruption. These machines must be able to run in the event of frequent interrupts in such a manner that the capability is not severely degraded. Thus, research and development of scalable RAS technologies is paramount to the success of future extreme-scale systems. This talk summarizes our past accomplishments, ongoing work, and future plans in the area of high-level RAS for HPC.}
}
% Invited talk at the 2nd Collaborative and Grid Computing Technologies Workshop (CGCTW) 2008, Cancun, 10-12 Apr 2008.
@misc{engelmann08resiliency,
  author       = {Engelmann, Christian},
  title        = {Resiliency for High-Performance Computing},
  month        = apr # "~10-12, ",
  year         = {2008},
  howpublished = {{Invited talk at the \href{http://acet.rdg.ac.uk/events/details/cancun.php} {$2^{nd}$ Collaborative and Grid Computing Technologies Workshop (CGCTW) 2008}, Cancun, Mexico}},
  url          = {http://www.christian-engelmann.info/publications/engelmann08resiliency.ppt.pdf},
  abstract     = {In order to address anticipated high failure rates, resiliency characteristics have become an urgent priority for next-generation high-performance computing (HPC) systems. One major source of concern are non-recoverable soft errors, i.e., bit flips in memory, cache, registers, and logic. The probability of such errors not only grows with system size, but also with increasing architectural vulnerability caused by employing accelerators and by shrinking nanometer technology. Reactive fault tolerance technologies, such as checkpoint/restart, are unable to handle high failure rates due to associated overheads, while proactive resiliency technologies, such as preemptive migration, simply fail as random soft errors can't be predicted. This talk proposes a new, bold direction in resiliency for HPC as it targets resiliency for next-generation extreme-scale HPC systems at the system software level through computational redundancy strategies, i.e., dual- and triple-modular redundancy.}
}
% Seminar at LAAS-CNRS, Toulouse, 11 Feb 2008.
@misc{engelmann08advanced,
  author       = {Engelmann, Christian},
  title        = {Advanced Fault Tolerance Solutions for High Performance Computing},
  month        = feb # "~11, ",
  year         = {2008},
  howpublished = {{Seminar at the \href{http://www.laas.fr}{Laboratoire d'Analyse et d'Architecture des Syst\`emes}, \href{http://www.cnrs.fr}{Centre National de la Recherche Scientifique}, Toulouse, France}},
  url          = {http://www.christian-engelmann.info/publications/engelmann08advanced.ppt.pdf},
  abstract     = {The continuing growth in high performance computing (HPC) system scale poses a challenge for system software and scientific applications with respect to reliability, availability and serviceability (RAS). With only very few exceptions, the availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. As a result, sites lower allowable job run times in order to force applications to store intermediate results (checkpoints) as insurance against lost computation time. However, checkpoints themselves waste valuable computation time and resources. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically with the trend towards capability computing, which drives the race for scientific discovery by running applications on the fastest machines available while desiring significant amounts of time (weeks and months) without interruption. These machines must be able to run in the event of frequent interrupts in such a manner that the capability is not severely degraded. Thus, research and development of scalable RAS technologies is paramount to the success of future extreme-scale systems. This talk summarizes our accomplishments in the area of high-level RAS for HPC, such as developed concepts and implemented proof-of-concept prototypes, and describes existing limitations, such as performance issues, which need to be dealt with for production-type deployment.}
}
% Seminar at the University of Reading, Department of Computer Science, 10 Oct 2007.
@misc{engelmann07service,
  author       = {Engelmann, Christian},
  title        = {Service-Level High Availability in Parallel and Distributed Systems},
  month        = oct # "~10, ",
  year         = {2007},
  howpublished = {{Seminar at the \href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk} {University of Reading}, Reading, United Kingdom}},
  url          = {http://www.christian-engelmann.info/publications/engelmann07service.pdf},
  abstract     = {As service-oriented architectures become more important in parallel and distributed computing systems, individual service instance reliability as well as appropriate service redundancy are essential to increase overall system availability. This talk focuses on redundancy strategies using service-level replication techniques. An overview of existing programming models for service-level high availability is presented and their differences, similarities, advantages, and disadvantages are discussed. Recent advances in providing service-level symmetric active/active high availability are discussed. While the primary target of the presented research is high availability for service nodes in tightly-coupled extreme-scale high-performance computing (HPC) systems, it is also applicable to loosely-coupled distributed computing scenarios.}
}
% Invited talk at WTTC 2007, 8 Jun 2007. City spelling corrected from "Khon Kean" to "Khon Kaen".
@misc{engelmann07advanced2,
  author       = {Engelmann, Christian},
  title        = {Advanced Fault Tolerance Solutions for High Performance Computing},
  month        = jun # "~8, ",
  year         = {2007},
  howpublished = {{Invited talk at the \href{http://www.thaigrid.or.th/wttc2007}{Workshop on Trends, Technologies and Collaborative Opportunities in High Performance and Grid Computing (WTTC) 2007}, Khon Kaen, Thailand}},
  url          = {http://www.christian-engelmann.info/publications/engelmann07advanced2.ppt.pdf},
  abstract     = {The continuing growth in high performance computing (HPC) system scale poses a challenge for system software and scientific applications with respect to reliability, availability and serviceability (RAS). With only very few exceptions, the availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. As a result, sites lower allowable job run times in order to force applications to store intermediate results (checkpoints) as insurance against lost computation time. However, checkpoints themselves waste valuable computation time and resources. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically with the trend towards capability computing, which drives the race for scientific discovery by running applications on the fastest machines available while desiring significant amounts of time (weeks and months) without interruption. These machines must be able to run in the event of frequent interrupts in such a manner that the capability is not severely degraded. Thus, research and development of scalable RAS technologies is paramount to the success of future extreme-scale systems. This talk summarizes our accomplishments in the area of high-level RAS for HPC, such as developed concepts and implemented proof-of-concept prototypes, and describes existing limitations, such as performance issues, which need to be dealt with for production-type deployment.}
}
% Invited talk at WTTC 2007, 4-5 Jun 2007. City spelling corrected from "Khon Kean" to "Khon Kaen".
@misc{engelmann07advanced,
  author       = {Engelmann, Christian},
  title        = {Advanced Fault Tolerance Solutions for High Performance Computing},
  month        = jun # "~4-5, ",
  year         = {2007},
  howpublished = {{Invited talk at the \href{http://www.thaigrid.or.th/wttc2007}{Workshop on Trends, Technologies and Collaborative Opportunities in High Performance and Grid Computing (WTTC) 2007}, Khon Kaen, Thailand}},
  url          = {http://www.christian-engelmann.info/publications/engelmann07advanced.ppt.pdf},
  abstract     = {The continuing growth in high performance computing (HPC) system scale poses a challenge for system software and scientific applications with respect to reliability, availability and serviceability (RAS). With only very few exceptions, the availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. As a result, sites lower allowable job run times in order to force applications to store intermediate results (checkpoints) as insurance against lost computation time. However, checkpoints themselves waste valuable computation time and resources. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically with the trend towards capability computing, which drives the race for scientific discovery by running applications on the fastest machines available while desiring significant amounts of time (weeks and months) without interruption. These machines must be able to run in the event of frequent interrupts in such a manner that the capability is not severely degraded. Thus, research and development of scalable RAS technologies is paramount to the success of future extreme-scale systems. This talk summarizes our accomplishments in the area of high-level RAS for HPC, such as developed concepts and implemented proof-of-concept prototypes, and describes existing limitations, such as performance issues, which need to be dealt with for production-type deployment.}
}
% Seminar at Johannes Kepler University, Institute of Graphics and Parallel Processing, Linz, 10 Apr 2007.
@misc{engelmann07operating,
  author       = {Engelmann, Christian},
  title        = {Operating System Research at {ORNL}: {S}ystem-level Virtualization},
  month        = apr # "~10, ",
  year         = {2007},
  howpublished = {{Seminar at the \href{http://www.gup.uni-linz.ac.at} {Institute of Graphics and Parallel Processing}, \href{http://www.uni-linz.ac.at}{Johannes Kepler University}, Linz, Austria}},
  url          = {http://www.christian-engelmann.info/publications/engelmann07operating.ppt.pdf},
  abstract     = {The emergence of virtualization enabled hardware, such as the latest generation AMD and Intel processors, has raised significant interest in High Performance Computing (HPC) community. In particular, system-level virtualization provides an opportunity to advance the design and development of operating systems, programming environments, administration practices, and resource management tools. This leads to some potential research topics for HPC, such as failure tolerance, system management, and solutions for application porting to new HPC platforms. This talk will present an overview of the research in System-level Virtualization taking place by the Systems Research Team in the Computer Science Research Group at Oak Ridge National Laboratory.}
}
% Seminar at the University of Reading, United Kingdom, March 14, 2007.
@misc{engelmann07towards,
  author       = "Christian Engelmann",
  title        = "Towards High Availability for High-Performance Computing System Services: {A}ccomplishments and Limitations",
  month        = mar # "~14, ",
  year         = "2007",
  howpublished = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk} {University of Reading}, Reading, United Kingdom}",
  url          = "http://www.christian-engelmann.info/publications/engelmann07towards.pdf",
  abstract     = "During the last several years, our teams at Oak Ridge National Laboratory, Louisiana Tech University, and Tennessee Technological University focused on efficient redundancy strategies for head and service nodes of high-performance computing (HPC) systems in order to pave the way for high availability (HA) in HPC. These nodes typically run critical HPC system services, like job and resource management, and represent single points of failure and control for an entire HPC system. The overarching goal of our research is to provide high-level reliability, availability, and serviceability (RAS) for HPC systems by combining HA and HPC technology. This talk summarizes our accomplishments, such as developed concepts and implemented proof-of-concept prototypes, and describes existing limitations, such as performance issues, which need to be dealt with for production-type deployment."
}
% Seminar at the University of Reading, United Kingdom, June 9, 2006.
% The abstract uses an ASCII apostrophe; a backtick would typeset as an opening quote.
@misc{engelmann06high,
  author       = "Christian Engelmann",
  title        = "High Availability for Ultra-Scale High-End Scientific Computing",
  month        = jun # "~9, ",
  year         = "2006",
  howpublished = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk} {University of Reading}, Reading, United Kingdom}",
  url          = "http://www.christian-engelmann.info/publications/engelmann06high.ppt.pdf",
  abstract     = "A major concern in exploiting ultra-scale architectures for scientific high-end computing (HEC) with tens to hundreds of thousands of processors, such as the IBM Blue Gene/L and the Cray X1, is the potential inability to identify problems and take preemptive action before a failure impacts a running job. In fact, in systems of this scale, predictions estimate the mean time to interrupt in terms of hours. Current solutions for fault-tolerance in HEC focus on dealing with the result of a failure. However, most are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services (e.g. MPI) or even of the entire machine. High availability (HA) computing strives to avoid the problems of unexpected failures through preemptive measures. There are various techniques to implement high availability. In contrast to active/hot-standby high availability with its fail-over model, active/active high availability with its virtual synchrony model is superior in many areas including scalability, throughput, availability and responsiveness. However, it is significantly more complex. The overall goal of our research is to expand today's effort in HA for HEC, so that systems that have the ability to hot-swap hardware components can be kept alive by an OS runtime environment that understands the concept of dynamic system configuration. This talk will present an overview of recent research at Oak Ridge National Laboratory in high availability solutions for ultra-scale scientific high-end computing."
}
% Seminar at Johannes Kepler University, Linz, Austria, April 19, 2006.
% Apostrophes are plain ASCII so the entry stays safe for 8-bit classic BibTeX.
@misc{scott06advancing,
  author       = "Stephen L. Scott and Christian Engelmann",
  title        = "Advancing Reliability, Availability and Serviceability for High-Performance Computing",
  month        = apr # "~19, ",
  year         = "2006",
  howpublished = "{Seminar at the \href{http://www.gup.uni-linz.ac.at} {Institute of Graphics and Parallel Processing}, \href{http://www.uni-linz.ac.at}{Johannes Kepler University}, Linz, Austria}",
  url          = "http://www.christian-engelmann.info/publications/scott06advancing.ppt.pdf",
  abstract     = "Today's high performance computing systems have several reliability deficiencies resulting in noticeable availability and serviceability issues. For example, head and service nodes represent a single point of failure and control for an entire system as they render it inaccessible and unmanageable in case of a failure until repair, causing a significant downtime. Furthermore, current solutions for fault-tolerance focus on dealing with the result of a failure. However, most are unable to transparently mask runtime system configuration changes caused by failures and require a complete restart of essential system services, such as MPI, in case of a failure. High availability computing strives to avoid the problems of unexpected failures through preemptive measures. The overall goal of our research is to expand today's effort in high availability for high-performance computing, so that systems can be kept alive by an OS runtime environment that understands the concepts of dynamic system configuration and degraded operation mode. This talk will present an overview of recent research performed at Oak Ridge National Laboratory in collaboration with Louisiana Tech University, North Carolina State University and the University of Reading in developing core technologies and proof-of-concept prototypes that improve the overall reliability, availability and serviceability of high-performance computing systems."
}
% Seminar at the University of Reading, United Kingdom, October 18, 2005.
% The abstract uses an ASCII apostrophe; a backtick would typeset as an opening quote.
@misc{engelmann05high4,
  author       = "Christian Engelmann",
  title        = "High Availability for Ultra-Scale High-End Scientific Computing",
  month        = oct # "~18, ",
  year         = "2005",
  howpublished = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk} {University of Reading}, Reading, United Kingdom}",
  url          = "http://www.christian-engelmann.info/publications/engelmann05high4.ppt.pdf",
  abstract     = "A major concern in exploiting ultra-scale architectures for scientific high-end computing (HEC) with tens to hundreds of thousands of processors, such as the IBM Blue Gene/L and the Cray X1, is the potential inability to identify problems and take preemptive action before a failure impacts a running job. In fact, in systems of this scale, predictions estimate the mean time to interrupt in terms of hours. Current solutions for fault-tolerance in HEC focus on dealing with the result of a failure. However, most are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services (e.g. MPI) or even of the entire machine. High availability (HA) computing strives to avoid the problems of unexpected failures through preemptive measures. There are various techniques to implement high availability. In contrast to active/hot-standby high availability with its fail-over model, active/active high availability with its virtual synchrony model is superior in many areas including scalability, throughput, availability and responsiveness. However, it is significantly more complex. The overall goal of our research is to expand today's effort in HA for HEC, so that systems that have the ability to hot-swap hardware components can be kept alive by an OS runtime environment that understands the concept of dynamic system configuration. This talk will present an overview of recent research at Oak Ridge National Laboratory in high availability solutions for ultra-scale scientific high-end computing."
}
% Seminar at Fayetteville State University, Fayetteville, NC, USA, September 26, 2005.
% Apostrophes are plain ASCII so the entry stays safe for 8-bit classic BibTeX.
@misc{engelmann05high3,
  author       = "Christian Engelmann",
  title        = "High Availability for Ultra-Scale High-End Scientific Computing",
  month        = sep # "~26, ",
  year         = "2005",
  howpublished = "{Seminar at the \href{http://www.uncfsu.edu/macsc}{Department of Mathematics and Computer Science}, \href{http://www.uncfsu.edu}{Fayetteville State University}, Fayetteville, NC, USA}",
  url          = "http://www.christian-engelmann.info/publications/engelmann05high3.ppt.pdf",
  abstract     = "A major concern in exploiting ultra-scale architectures for scientific high-end computing (HEC) with tens to hundreds of thousands of processors, such as the IBM Blue Gene/L and the Cray X1, is the potential inability to identify problems and take preemptive action before a failure impacts a running job. In fact, in systems of this scale, predictions estimate the mean time to interrupt in terms of hours. Current solutions for fault-tolerance in HEC focus on dealing with the result of a failure. However, most are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services (e.g. MPI) or even of the entire machine. High availability (HA) computing strives to avoid the problems of unexpected failures through preemptive measures. There are various techniques to implement high availability. In contrast to active/hot-standby high availability with its fail-over model, active/active high availability with its virtual synchrony model is superior in many areas including scalability, throughput, availability and responsiveness. However, it is significantly more complex. The overall goal of our research is to expand today's effort in HA for HEC, so that systems that have the ability to hot-swap hardware components can be kept alive by an OS runtime environment that understands the concept of dynamic system configuration. This talk will present an overview of recent research at Oak Ridge National Laboratory in fault tolerance and high availability solutions for ultra-scale scientific high-end computing."
}
% Seminar at the University of Reading, United Kingdom, May 13, 2005.
% Apostrophes are plain ASCII so the entry stays safe for 8-bit classic BibTeX.
@misc{engelmann05high2,
  author       = "Christian Engelmann",
  title        = "High Availability for Ultra-Scale High-End Scientific Computing",
  month        = may # "~13, ",
  year         = "2005",
  howpublished = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk} {University of Reading}, Reading, United Kingdom}",
  url          = "http://www.christian-engelmann.info/publications/engelmann05high2.ppt.pdf",
  abstract     = "A major concern in exploiting ultra-scale architectures for scientific high-end computing (HEC) with tens to hundreds of thousands of processors, such as the IBM Blue Gene/L and the Cray X1, is the potential inability to identify problems and take preemptive action before a failure impacts a running job. In fact, in systems of this scale, predictions estimate the mean time to interrupt in terms of hours. Current solutions for fault-tolerance in HEC focus on dealing with the result of a failure. However, most are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services (e.g. MPI) or even of the entire machine. High availability (HA) computing strives to avoid the problems of unexpected failures through preemptive measures. There are various techniques to implement high availability. In contrast to active/hot-standby high availability with its fail-over model, active/active high availability with its virtual synchrony model is superior in many areas including scalability, throughput, availability and responsiveness. However, it is significantly more complex. The overall goal of our research is to expand today's effort in HA for HEC, so that systems that have the ability to hot-swap hardware components can be kept alive by an OS runtime environment that understands the concept of dynamic system configuration. This talk will present an overview of recent research at Oak Ridge National Laboratory in fault-tolerant heterogeneous metacomputing, advanced super-scalable algorithms and high availability system software for ultra-scale scientific high-end computing."
}
% Seminar at Louisiana Tech University, Ruston, LA, USA, April 15, 2005.
% Apostrophes are plain ASCII so the entry stays safe for 8-bit classic BibTeX.
@misc{engelmann05high1,
  author       = "Christian Engelmann",
  title        = "High Availability for Ultra-Scale High-End Scientific Computing",
  month        = apr # "~15, ",
  year         = "2005",
  howpublished = "{Seminar at the \href{http://cenit.latech.edu}{Center for Entrepreneurship and Information Technology}, \href{http://www.latech.edu}{Louisiana Tech University}, Ruston, LA, USA}",
  url          = "http://www.christian-engelmann.info/publications/engelmann05high1.ppt.pdf",
  abstract     = "A major concern in exploiting ultra-scale architectures for scientific high-end computing (HEC) with tens to hundreds of thousands of processors is the potential inability to identify problems and take preemptive action before a failure impacts a running job. In fact, in systems of this scale, predictions estimate the mean time to interrupt in terms of hours. Current solutions for fault-tolerance in HEC focus on dealing with the result of a failure. However, most are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services (e.g. MPI) or even of the entire machine. High availability (HA) computing strives to avoid the problems of unexpected failures through preemptive measures. There are various techniques to implement high availability. In contrast to active/hot-standby high availability with its fail-over model, active/active high availability with its virtual synchrony model is superior in many areas including scalability, throughput, availability and responsiveness. However, it is significantly more complex. The overall goal of this research is to expand today's effort in HA for HEC, so that systems that have the ability to hot-swap hardware components can be kept alive by an OS runtime environment that understands the concept of dynamic system configuration. With the aim of addressing the future challenges of high availability in ultra-scale HEC, this project intends to develop a proof-of-concept implementation of an active/active high availability system software framework."
}
% Invited talk at SIAM PP 2004, San Francisco, CA, USA, February 25, 2004.
@misc{engelmann04diskless,
  author       = "Christian Engelmann",
  title        = "Diskless Checkpointing on Super-scale Architectures -- {A}pplied to the Fast Fourier Transform",
  month        = feb # "~25, ",
  year         = "2004",
  howpublished = "{Invited talk at the \href{http://www.siam.org/meetings/pp04} {$11^{th}$ SIAM Conference on Parallel Processing for Scientific Computing (SIAM PP) 2004}, San Francisco, CA, USA}",
  url          = "http://www.christian-engelmann.info/publications/engelmann04diskless.ppt.pdf",
  abstract     = "This talk discusses the issue of fault-tolerance in distributed computer systems with tens or hundreds of thousands of diskless processor units. Such systems, like the IBM Blue Gene/L, are predicted to be deployed in the next five to ten years. Since a 100,000-processor system is going to be less reliable, scientific applications need to be able to recover from occurring failures more efficiently. In this paper, we adapt the present technique of diskless checkpointing to such huge distributed systems in order to equip existing scientific algorithms with super-scalable fault-tolerance. First, we discuss the method of diskless checkpointing, then we adapt this technique to super-scale architectures and finally we present results from an implementation of the Fast Fourier Transform that uses the adapted technique to achieve super-scale fault-tolerance."
}
% Seminar at Oak Ridge National Laboratory, Oak Ridge, TN, USA, January 29, 2004.
@misc{engelmann04superscalable,
  author       = "Christian Engelmann",
  title        = "Super-scalable Algorithms -- {N}ext Generation Supercomputing on 100,000 and more Processors",
  month        = jan # "~29, ",
  year         = "2004",
  howpublished = "{Seminar at the \href{http://www.csm.ornl.gov}{Computer Science and Mathematics Division}, \href{http://www.ornl.gov} {Oak Ridge National Laboratory}, Oak Ridge, TN, USA}",
  url          = "http://www.christian-engelmann.info/publications/engelmann04superscalable.ppt.pdf",
  abstract     = "This talk discusses recent research into the issues and potential problems of algorithm scalability and fault-tolerance on next-generation high-performance computer systems with tens and even hundreds of thousands of processors. Such massively parallel computers, like the IBM Blue Gene/L, are going to be deployed in the next five to ten years and existing deficiencies in scalability and fault-tolerance need to be addressed soon. Scientific algorithms have shown poor scalability on 10,000-processor systems that exist today. Furthermore, future systems will be less reliable due to the large number of components. Super-scalable algorithms, which have the properties of scale invariance and natural fault-tolerance, are able to get the correct answer despite multiple task failures and without checkpointing. We will show that such algorithms exist for a wide variety of problems, such as finite difference, finite element, multigrid and global maximum. Despite these findings, traditional algorithms may still be preferred due to their known behavior, or simply because a super-scalable algorithm does not exist or is hard to find for a particular problem. In this case, we propose a peer-to-peer diskless checkpointing algorithm that can provide scale invariant fault-tolerance."
}
% Seminar at North Carolina State University, Raleigh, NC, USA, February 11, 2003.
% Year follows the citation key, the URL file name and the file's reverse-chronological order.
@misc{engelmann03distributed,
  author       = "Christian Engelmann",
  title        = "Distributed Peer-to-Peer Control for {Harness}",
  month        = feb # "~11, ",
  year         = "2003",
  howpublished = "{Seminar at the \href{http://www.csc.ncsu.edu}{Department of Computer Science}, \href{http://www.ncsu.edu}{North Carolina State University}, Raleigh, NC, USA}",
  url          = "http://www.christian-engelmann.info/publications/engelmann03distributed.ppt.pdf",
  abstract     = "Harness is an adaptable fault-tolerant virtual machine environment for next-generation heterogeneous distributed computing developed as a follow on to PVM. It additionally enables the assembly of applications from plug-ins and provides fault-tolerance. This work describes the distributed control, which manages global state replication to ensure a high-availability of service. Group communication services achieve an agreement on an initial global state and a linear history of global state changes at all members of the distributed virtual machine. This global state is replicated to all members to easily recover from single, multiple and cascaded faults. A peer-to-peer ring network architecture and tunable multi-point failure conditions provide heterogeneity and scalability. Finally, the integration of the distributed control into the multi-threaded kernel architecture of Harness offers a fault-tolerant global state database service for plug-ins and applications."
}
% Master's thesis, University of Reading, October 22, 2010; url2 points to the slides (.ppt.pdf).
@mastersthesis{jones10simulation,
  author   = "Ian S. Jones",
  title    = "Simulation of Large Scale Architectures on High Performance Computers",
  month    = oct # "~22, ",
  year     = "2010",
  school   = "\href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Advisors: Prof. Vassil N. Alexandrov (University of Reading); Christian Engelmann (Oak Ridge National Laboratory); George Bosilca (University of Tennessee, Knoxville)",
  url      = "http://www.christian-engelmann.info/publications/jones10simulation.pdf",
  url2     = "http://www.christian-engelmann.info/publications/jones10simulation.ppt.pdf",
  abstract = "Powerful supercomputers often need to be simulated for the purposes of testing the scalability of various applications. This thesis endeavours to further develop the existing simulator, XSIM, and implement the functionality to simulate real-world networks and the latency which might be encountered by messages travelling through that network. The upgraded simulator will then be tested at the Oak Ridge National Laboratory. The work completed herein should provide a solid foundation for further improvements to XSIM; it simulates a variety of basic network topologies, calculating the shortest path for any given message and generates a transmission time."
}
% Master's thesis, University of Reading, March 12, 2010; url2 points to the slides (.ppt.pdf).
@mastersthesis{boehm10development,
  author   = "Swen B{\"o}hm",
  title    = "Development of a {RAS} Framework for {HPC} Environments: {Realtime} Data Reduction of Monitoring Data",
  month    = mar # "~12, ",
  year     = "2010",
  school   = "\href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Advisors: Prof. Vassil N. Alexandrov (University of Reading); Christian Engelmann (Oak Ridge National Laboratory); George Bosilca (University of Tennessee, Knoxville)",
  url      = "http://www.christian-engelmann.info/publications/boehm10development.pdf",
  url2     = "http://www.christian-engelmann.info/publications/boehm10development.ppt.pdf",
  abstract = "The advancements of high-performance computing (HPC) systems in the last decades lead to more and more complex systems containing thousands or tens-of-thousands computing systems that are working together. While the computational performance of these systems increased dramatically in the last years the I/O subsystems have not gained such a significant improvement. With increasing numbers of hardware components in the next generation HPC systems maintaining the reliability of such systems becomes more and more difficult since the probability of hardware failures is increasing with the number of components. The capacities of traditional reactive fault tolerance technologies are exceeded by the development of next generation systems and alternatives have to be found. This paper discusses a monitoring system that is using data reduction techniques to decrease the amount of the collected data. The system is part of a proactive fault tolerance system that may challenge the reliability problems of exascale HPC systems."
}
% Master's thesis, University of Reading, March 12, 2010; url2 points to the slides (.ppt.pdf).
@mastersthesis{lauer10simulation,
  author   = "Frank Lauer",
  title    = "Simulation of Advanced Large-Scale {HPC} Architectures",
  month    = mar # "~12, ",
  year     = "2010",
  school   = "\href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Advisors: Prof. Vassil N. Alexandrov (University of Reading); Christian Engelmann (Oak Ridge National Laboratory); George Bosilca (University of Tennessee, Knoxville)",
  url      = "http://www.christian-engelmann.info/publications/lauer10simulation.pdf",
  url2     = "http://www.christian-engelmann.info/publications/lauer10simulation.ppt.pdf",
  abstract = "The rapid development of massive parallel systems in the high-performance computing (HPC) area requires efficient scalability of applications. The next generation's design of supercomputers is today not certain in terms of what will be the computational, memory and I/O capabilities. However it is most certain that they become even more parallel. Getting the most performance from these machines is not only a matter of hardware, it is also an issue of programming design. Therefore, it has to be a co-development. However, how to test algorithm's on machines which are not existing today. To address the programming issues in terms of scalability and fault tolerance for the next generation, this projects aim is to design and develop a simulator based on parallel discrete event simulation (PDES) for applications using MPI communication. Some of the fastest supercomputers in the world already interconnecting $10^5$ cores together to catch up the simulator will be able to simulate at least $10^7$ virtual processes."
}
% Master's thesis, University of Reading, September 22, 2009; url2 points to the slides (.ppt.pdf).
@mastersthesis{litvinova09ras,
  author   = "Antonina Litvinova",
  title    = "{RAS} Framework Engine Prototype",
  month    = sep # "~22, ",
  year     = "2009",
  school   = "\href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Advisors: Prof. Vassil N. Alexandrov (University of Reading); Christian Engelmann (Oak Ridge National Laboratory); George Bosilca (University of Tennessee, Knoxville)",
  url      = "http://www.christian-engelmann.info/publications/litvinova09ras.pdf",
  url2     = "http://www.christian-engelmann.info/publications/litvinova09ras.ppt.pdf",
  abstract = "Extreme high performance computing (HPC) systems constantly increase in scale from a few thousands of processors cores to thousands of thousands of processors cores and beyond. However their system mean-time to interrupt decreases according. The current approach of fault tolerance in HPC is checkpoint/restart, i.e. a method based on recovery from experienced failures. However checkpoint/restart cannot deal with errors in the same efficient way anymore, because of HPC systems modification. For example, increasing error rates, increasing aggregate memory, and not proportionally increasing input/output capabilities. The recently introduced concept is proactive fault tolerance which avoids experiencing failures through preventative measures. Proactive fault tolerance uses migration which is an emerging technology that prevents failures on HPC systems by migrating applications or application parts away from a node that is deteriorating to a spare node. This thesis discusses work conducted at ORNL to develop a Proactive Fault Tolerance Framework Engine Prototype for HPC systems with high reliability, availability and serviceability. The prototype performs environmental system monitoring, system event logging, parallel job monitoring and system resource monitoring in order to analyse HPC system reliability and to perform fault avoidance through a migration."
}
% Master's thesis, University of Reading, March 14, 2007; url2 points to the slides (.ppt.pdf).
% The abstract typesets "chroot" with \texttt because \verb is not allowed inside macro arguments.
@mastersthesis{koenning07virtualized,
  author   = "Bj{\"o}rn K{\"o}nning",
  title    = "Virtualized Environments for the {Harness Workbench}",
  month    = mar # "~14, ",
  year     = "2007",
  school   = "\href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Advisors: Prof. Vassil N. Alexandrov (University of Reading); Christian Engelmann (Oak Ridge National Laboratory)",
  url      = "http://www.christian-engelmann.info/publications/koenning07virtualized.pdf",
  url2     = "http://www.christian-engelmann.info/publications/koenning07virtualized.ppt.pdf",
  abstract = "The expanded use of computational sciences today leads to a significant need of high performance computing systems. High performance computing is currently undergoing vigorous revival, and multiple efforts are underway to develop much faster computing systems in the near future. New software tools are required for the efficient use of petascale computing systems. With the new Harness Workbench Project the Oak Ridge National Laboratory intends to develop an appropriate development and runtime environment for high performance computing platforms. This dissertation project is part of the Harness Workbench Project, and deals with the development of a concept for virtualised environments and various approaches to create and describe them. The developed virtualisation approach is based on the \texttt{chroot} mechanism and uses platform-independent environment descriptions. File structures and environment variables are emulated to provide the portability of computational software over diverse high performance computing platforms. Security measures and sandbox characteristic are integrable."
}
% Master's thesis, University of Reading (double diploma with FHTW Berlin), March 14, 2007; url2 points to the slides (.ppt.pdf).
@mastersthesis{weber07high,
  author   = "Matthias Weber",
  title    = "High Availability for the {Lustre} File System",
  month    = mar # "~14, ",
  year     = "2007",
  school   = "\href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Double diploma in conjunction with the \href{http://www.f1.fhtw-berlin.de}{Department of Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical College for Engineering and Economics (FHTW) Berlin}, Germany. Advisors: Prof. Vassil N. Alexandrov (University of Reading); Christian Engelmann (Oak Ridge National Laboratory)",
  url      = "http://www.christian-engelmann.info/publications/weber07high.pdf",
  url2     = "http://www.christian-engelmann.info/publications/weber07high.ppt.pdf",
  abstract = "With the growing importance of high performance computing and, more importantly, the fast growing size of sophisticated high performance computing systems, research in the area of high availability is essential to meet the needs to sustain the current growth. This Master thesis project aims to improve the availability of Lustre. Major concern of this project is the metadata server of the file system. The metadata server of Lustre suffers from the last single point of failure in the file system. To overcome this single point of failure an active/active high availability approach is introduced. The new file system design with multiple MDS nodes running in virtual synchrony leads to a significant increase of availability. Two prototype implementations aim to show how the proposed system design and its new realized form of symmetric active/active high availability can be accomplished in practice. The results of this work point out the difficulties in adapting the file system to the active/active high availability design. Tests identify not achieved functionality and show performance problems of the proposed solution. The findings of this dissertation may be used for further work on high availability for distributed file systems."
}
% Master's thesis, University of Reading (double diploma with FHTW Berlin), March 6, 2006; url2 points to the slides (.ppt.pdf).
@mastersthesis{baumann06design,
  author   = "Ronald Baumann",
  title    = "Design and Development of Prototype Components for the {Harness} High-Performance Computing Workbench",
  month    = mar # "~6, ",
  year     = "2006",
  school   = "\href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Double diploma in conjunction with the \href{http://www.f1.fhtw-berlin.de}{Department of Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical College for Engineering and Economics (FHTW) Berlin}, Germany. Advisors: Prof. Vassil N. Alexandrov (University of Reading); George A. (Al) Geist and Christian Engelmann (Oak Ridge National Laboratory)",
  url      = "http://www.christian-engelmann.info/publications/baumann06design.pdf",
  url2     = "http://www.christian-engelmann.info/publications/baumann06design.ppt.pdf",
  abstract = "This master thesis examines plug-in technology, especially the new field of parallel plug-ins. Plug-ins are popular because they extend the capabilities of software packages such as browsers and Photoshop, and allow an individual user to add new functionality. Parallel plug-ins also provide the above capabilities to a distributed set of resources, i.e., a plug-in now becomes a set of coordinating plug-ins. Second, the set of plugins may be heterogeneous either in function or because the underlying resources are heterogeneous. This new dimension of complexity provides a rich research space which is explored in this thesis. Experiences are collected and presented as parallel plug-in paradigms and concepts. The Harness framework was used in this project, in particular the plugin manager and available communication capabilities. Plug-ins provide methods for users to extend Harness according to their requirements. The result of this thesis is a parallel plug-in paradigm and template for Harness. Users of the Harness environment will be able to design and implement their applications in the form of parallel plug-ins easier and faster by using the paradigm resulting from this project. Prototypes were implemented which handle different aspects of parallel plug-ins. Parallel plug-in configurations were tested on an appropriate number of Harness kernels, including available communication and error-handling capabilities. Furthermore, research was done in the area of fault tolerance while parallel plug-ins are (un)loaded, as well as while a task is performed."
}
% Master's thesis by Kai Uhlemann, University of Reading, 2006 (double diploma with FHTW Berlin).
% The month field concatenates the mar macro with the day of month; url2 is a non-standard
% field (ignored by standard styles) that points at the .ppt.pdf companion file.
@mastersthesis{uhlemann06high,
  author   = "Kai Uhlemann",
  title    = "High Availability for High-End Scientific Computing",
  month    = mar # "~6, ",
  year     = "2006",
  school   = "\href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Double diploma in conjunction with the \href{http://www.f1.fhtw-berlin.de}{Department of Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical College for Engineering and Economics (FHTW) Berlin}, Germany. Advisors: Prof.~Vassil N. Alexandrov (University of Reading); George A. (Al) Geist and Christian Engelmann (Oak Ridge National Laboratory)",
  url      = "http://www.christian-engelmann.info/publications/uhlemann06high.pdf",
  url2     = "http://www.christian-engelmann.info/publications/uhlemann06high.ppt.pdf",
  abstract = "With the growing interest and popularity in high performance cluster computing and, more importantly, the fast growing size of compute clusters, research in the area of high availability is essential to meet the needs to sustain the current growth. This Master thesis project introduces a new approach for high availability focusing on the head node of a cluster system. This project's focus is on providing high availability to the job scheduler service, which is the most vital part of the traditional Beowulf-style cluster architecture. This research seeks to add high availability to the job scheduler service and resource management system, typically running on the head node, leading to a significant increase of availability for cluster computing. Also, this software project takes advantage of the virtual synchrony paradigm to achieve active/active replication, the highest form of high availability. A proof-of-concept implementation shows how high availability can be designed in software and what results can be expected of such a system.
              The results may be reused for future or existing projects to further improve and extend the high availability of compute clusters."
}
% PhD thesis by Christian Engelmann, University of Reading, 2008.
% url2 is a non-standard field (ignored by standard styles) that points at the
% .ppt.pdf companion file.
@phdthesis{engelmann08symmetric3,
  author   = "Christian Engelmann",
  title    = "Symmetric Active/Active High Availability for High-Performance Computing System Services",
  year     = "2008",
  school   = "\href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Advisor: Prof.~Vassil N. Alexandrov (University of Reading)",
  url      = "http://www.christian-engelmann.info/publications/engelmann08symmetric3.pdf",
  url2     = "http://www.christian-engelmann.info/publications/engelmann08symmetric3.ppt.pdf",
  abstract = "In order to address anticipated high failure rates, reliability, availability and serviceability have become an urgent priority for next-generation high-performance computing (HPC) systems. This thesis aims to pave the way for highly available HPC systems by focusing on their most critical components and by reinforcing them with appropriate high availability solutions. Service components, such as head and service nodes, are the Achilles heel of a HPC system. A failure typically results in a complete system-wide outage. This thesis targets efficient software state replication mechanisms for service component redundancy to achieve high availability as well as high performance. Its methodology relies on defining a modern theoretical foundation for providing service-level high availability, identifying availability deficiencies of HPC systems, and comparing various service-level high availability methods. This thesis showcases several developed proof-of-concept prototypes providing high availability for services running on HPC head and service nodes using the symmetric active/active replication method, i.e., state-machine replication, to complement prior work in this area using active/standby and asymmetric active/active configurations.
              Presented contributions include a generic taxonomy for service high availability, an insight into availability deficiencies of HPC systems, and a unified definition of service-level high availability methods. Further contributions encompass a fully functional symmetric active/active high availability prototype for a HPC job and resource management service that does not require modification of service, a fully functional symmetric active/active high availability prototype for a HPC parallel file system metadata service that offers high performance, and two preliminary prototypes for a transparent symmetric active/active replication software framework for client-service and dependent service scenarios that hide the replication infrastructure from clients and services. Assuming a mean-time to failure of 5,000 hours for a head or service node, all presented prototypes improve service availability from 99.285\% to 99.995\% in a two-node system, and to 99.99996\% with three nodes."
}
% Master's thesis by Christian Engelmann, University of Reading, 2001 (double diploma with FHTW Berlin).
% The month field concatenates the jul macro with the day of month; url2 is a non-standard
% field (ignored by standard styles) that points at the .ppt.pdf companion file.
@mastersthesis{engelmann01distributed,
  author   = "Christian Engelmann",
  title    = "Distributed Peer-to-Peer Control for {Harness}",
  month    = jul # "~7, ",
  year     = "2001",
  school   = "\href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Double diploma in conjunction with the \href{http://www.f1.fhtw-berlin.de}{Department of Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical College for Engineering and Economics (FHTW) Berlin}, Germany. Advisors: Prof.~Vassil N. Alexandrov (University of Reading); George A. (Al) Geist (Oak Ridge National Laboratory)",
  url      = "http://www.christian-engelmann.info/publications/engelmann01distributed.pdf",
  url2     = "http://www.christian-engelmann.info/publications/engelmann01distributed.ppt.pdf",
  abstract = "Parallel processing, the method of cutting down a large computational problem into many small tasks which are solved in parallel, is a field of increasing importance in science. Cost-effective, flexible and efficient simulations of mathematical models of physical, chemical or biological real-world problems are replacing the traditional experimental research. Current software solutions for parallel and scientific computation, like Parallel Virtual Machine and Message Passing Interface, have limitations in handling faults and failures, in utilizing heterogeneous and dynamically changing communication structures, and in enabling migrating or cooperative applications. The current research in heterogeneous adaptable reconfigurable networked systems (Harness) aims to produce the next generation of software solutions for distributed computing. A high-available and light-weighted distributed virtual machine service provides an encapsulation of a few hundred to a few thousand physical machines in a virtual heterogeneous large scale cluster.
              A high availability of a service in distributed systems can be achieved by replication of the service state on multiple server processes. If one or more server processes fails, the surviving ones continue to provide the service because they know the state. Since every member of a distributed virtual machine is part of the distributed virtual machine service state and is able to change this state, a distributed control is needed to replicate the state and maintain its consistency. This distributed control manages state changes as well as the state-replication and the detection of and recovery from faults and failures of server processes. This work analyzes system architectures currently used in heterogeneous distributed computing by defining terms, conditions and assumptions. It shows that such systems are asynchronous and may use partially synchronous communication to detect and to distinguish different classes of faults and failures. It describes how a high availability of a large scale distributed service on a huge number of servers residing on different geographical locations can be realized. Asynchronous group communication services, such as Reliable Broadcast, Atomic Broadcast, Distributed Agreement and Membership, are analyzed to develop linear scalable algorithms in an unidirectional and in a bidirectional connected asynchronous peer-to-peer ring architecture. A Transaction Control group communication service is introduced as state-replication service. The system analysis distinguishes different types of distributed systems, where active transactions execute state changes using non-replicated data of one or more servers and inactive transactions report state changes using replicated data only. It is applicable for passive fault-tolerant distributed databases as well as for active fault-tolerant distributed control mechanisms.
              No control token is used and time stamps are avoided, so that all members of a server group have equal responsibilities and are independent from the system time. A prototype which implements the most complicated Transaction Control algorithm is realized due to the complexity of the distributed system and the early development stage of the introduced algorithms. The prototype is used to obtain practical experience with the state-replication algorithm."
}
% Master's thesis by Christian Engelmann, FHTW Berlin, 2001 (double diploma with the University of Reading).
% The month field concatenates the feb macro with the day of month; url2 is a non-standard
% field (ignored by standard styles) that points at the .ppt.pdf companion file.
@mastersthesis{engelmann01distributed2,
  author   = "Christian Engelmann",
  title    = "Distributed Peer-to-Peer Control for {Harness}",
  month    = feb # "~23, ",
  year     = "2001",
  school   = "\href{http://www.f1.fhtw-berlin.de}{Department of Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical College for Engineering and Economics (FHTW) Berlin}, Germany",
  note     = "Thesis research performed at Oak Ridge National Laboratory. Double diploma in conjunction with the \href{http://www.cs.reading.ac.uk}{Department of Computer Science}, \href{http://www.reading.ac.uk}{University of Reading}, UK. Advisors: Prof.~Uwe Metzler (Technical College for Engineering and Economics (FHTW) Berlin); George A. (Al) Geist (Oak Ridge National Laboratory)",
  url      = "http://www.christian-engelmann.info/publications/engelmann01distributed2.pdf",
  url2     = "http://www.christian-engelmann.info/publications/engelmann01distributed2.ppt.pdf",
  abstract = "Parallel processing, the method of cutting down a large computational problem into many small tasks which are solved in parallel, is a field of increasing importance in science. Cost-effective, flexible and efficient simulations of mathematical models of physical, chemical or biological real-world problems are replacing the traditional experimental research. Current software solutions for parallel and scientific computation, like Parallel Virtual Machine and Message Passing Interface, have limitations in handling faults and failures, in utilizing heterogeneous and dynamically changing communication structures, and in enabling migrating or cooperative applications. The current research in heterogeneous adaptable reconfigurable networked systems (Harness) aims to produce the next generation of software solutions for distributed computing. A high-available and light-weighted distributed virtual machine service provides an encapsulation of a few hundred to a few thousand physical machines in a virtual heterogeneous large scale cluster.
              A high availability of a service in distributed systems can be achieved by replication of the service state on multiple server processes. If one or more server processes fails, the surviving ones continue to provide the service because they know the state. Since every member of a distributed virtual machine is part of the distributed virtual machine service state and is able to change this state, a distributed control is needed to replicate the state and maintain its consistency. This distributed control manages state changes as well as the state-replication and the detection of and recovery from faults and failures of server processes. This work analyzes system architectures currently used in heterogeneous distributed computing by defining terms, conditions and assumptions. It shows that such systems are asynchronous and may use partially synchronous communication to detect and to distinguish different classes of faults and failures. It describes how a high availability of a large scale distributed service on a huge number of servers residing on different geographical locations can be realized. Asynchronous group communication services, such as Reliable Broadcast, Atomic Broadcast, Distributed Agreement and Membership, are analyzed to develop linear scalable algorithms in an unidirectional and in a bidirectional connected asynchronous peer-to-peer ring architecture. A Transaction Control group communication service is introduced as state-replication service. The system analysis distinguishes different types of distributed systems, where active transactions execute state changes using non-replicated data of one or more servers and inactive transactions report state changes using replicated data only. It is applicable for passive fault-tolerant distributed databases as well as for active fault-tolerant distributed control mechanisms.
              No control token is used and time stamps are avoided, so that all members of a server group have equal responsibilities and are independent from the system time. A prototype which implements the most complicated Transaction Control algorithm is realized due to the complexity of the distributed system and the early development stage of the introduced algorithms. The prototype is used to obtain practical experience with the state-replication algorithm."
}