<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
	xmlns:content="http://purl.org/rss/1.0/modules/content/"
	xmlns:wfw="http://wellformedweb.org/CommentAPI/"
	xmlns:dc="http://purl.org/dc/elements/1.1/"
	xmlns:atom="http://www.w3.org/2005/Atom"
	xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
	xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
	>

<channel>
	<title>Christian Engelmann, Ph.D.</title>
	<atom:link href="http://www.christian-engelmann.info/?feed=rss2" rel="self" type="application/rss+xml" />
	<link>http://www.christian-engelmann.info</link>
	<description>System Software Team Task Lead / R&#38;D Staff Scientist, Oak Ridge National Laboratory</description>
	<lastBuildDate>Sun, 19 May 2013 23:55:50 +0000</lastBuildDate>
	<language>en-US</language>
	<sy:updatePeriod>hourly</sy:updatePeriod>
	<sy:updateFrequency>1</sy:updateFrequency>
	<generator>http://wordpress.org/?v=3.5.1</generator>
		<item>
		<title>About Me</title>
		<link>http://www.christian-engelmann.info/?p=1</link>
		<comments>http://www.christian-engelmann.info/?p=1#comments</comments>
		<pubDate>Thu, 04 Apr 2013 04:00:24 +0000</pubDate>
		<dc:creator>engelmannc</dc:creator>
				<category><![CDATA[Uncategorized]]></category>

		<guid isPermaLink="false">https://shadow.dyndns.info/?p=1</guid>
		<description><![CDATA[Dr. Christian Engelmann is the Task Lead of the System Software Team in the Computer Science Research Group of the Computer Science and Mathematics Division at Oak Ridge National Laboratory, which is the U.S. Department of Energy&#8217;s largest multiprogram science and technology laboratory with an annual budget of $1.6 billion. He has 12 years experience [...]]]></description>
				<content:encoded><![CDATA[<p><img src="images/christian_engelmann.png" hspace="5" vspace="3" height="115" align="left">Dr. Christian Engelmann is the Task Lead of the System Software Team in the <a href="http://www.csm.ornl.gov/newsite/network_cluster.html" target="www.csm.ornl.gov_newsite_network_cluster.html">Computer Science Research Group</a> of the <a href="http://www.csm.ornl.gov" target="www.csm.ornl.gov">Computer Science and Mathematics Division</a> at <a href="http://www.ornl.gov" target="www.ornl.gov">Oak Ridge National Laboratory</a>, which is the <a href="http://science.energy.gov/ascr" target="science.energy.gov_ascr">U.S. Department of Energy&#8217;s</a> largest multiprogram science and technology laboratory with an annual budget of $1.6 billion. He has 12 years experience in software research and development (R&#038;D) for extreme-scale high-performance computing (HPC) systems with a strong research funding and publication record. In collaboration with other laboratories and universities, his research aims at solving computer science challenges in HPC software, such as scalability, dependability, energy efficiency, and portability, for the largest current and future supercomputers in the world. Dr. Engelmann&#8217;s primary expertise is in HPC resilience, i.e., providing efficiency and correctness in the presence of faults, errors, and failures through avoidance, masking, and recovery. As chair and member of several scientific committees and panels, including the U.S. DOE Technical Council on Resilience, he is a leading expert in the HPC resilience community. The term HPC resilience was coined in a co-authored whitepaper in 2009. Dr. 
Engelmann&#8217;s secondary expertise is in HPC hardware/software co-design through lightweight simulation of future-generation extreme-scale systems with up to 134,217,728 (2^27) processor cores, studying the impact of hardware, system software, and parallel application properties on the key HPC system design factors: performance, resilience, and power consumption. His skills further include leading R&#038;D teams, co-advising students, programming in C/C++, MPI, Fortran, and Java, and system administration.</p>
<p><i><a href="http://shadow.dyndns.info/engelmann.pdf">Download</a> NSF-style 2-page bio. <a href="http://shadow.dyndns.info/publications.pdf">Download</a> full list of publications. Resume available upon <a href="mailto:engelmannc@computer.org">request</a>.</i></p>
<h4>Contact Information</h4>
<p><img src="images/qr.png" align="right" width="100" height="100" border="0" alt="QR Code"></p>
<table width="85%" style="border:0pt; padding:0pt;">
<tr>
<td width="47%" style="border:0pt; padding:0pt;">
e-Mail: <a href="mailto:engelmannc@computer.org">engelmannc@computer.org</a><br />
Mail: P.O. Box 2008, Oak Ridge, TN 37831-6016, USA
</td>
<td width="26%" style="border:0pt; padding:0pt;">
Phone: +1 (865) 574-3132<br />
Fax: +1 (865) 576-5491
</td>
<td width="15%" style="border:0pt; padding:0pt;">
<a href="https://www.xing.com/profile/Christian_Engelmann7" target="www.xing.com_profile_Christian_Engelmann7"><img src="images/xing.png" width="80" height="15" border="0"></a><br />
<a href="http://www.linkedin.com/in/christianengelmann" target="www.linkedin.com_in_christianengelmann"><img src="images/linkedin.gif" width="80" height="15" border="0" alt="View Christian Engelmann's profile on LinkedIn"></a>
</td>
<td width="12%" style="border:0pt; padding:0pt;">
<a href="https://www.facebook.com/engelmannchr" target="www.facebook.com_engelmannchr"><img src="images/facebook.jpg" width="92" height="35" border="0" alt="View Christian Engelmann's profile on facebook"></a>
</td>
</tr>
</table>
<h4>Professional Accomplishments</h4>
<table width="87%" style="border:0pt; padding:0pt;">
<tr>
<td style="border:0pt; padding:0pt;">8 Research grants ($15.6M, 2 as lead-PI)</td>
<td style="border:0pt; padding:0pt;">7 <a href="http://shadow.dyndns.info/?page_id=99">Peer-reviewed journal articles</a></td>
<td style="border:0pt; padding:0pt;">36 Invited <a href="http://shadow.dyndns.info/?page_id=122">talks and seminars</a></td>
</tr>
<tr>
<td style="border:0pt; padding:0pt;">8 <a href="http://shadow.dyndns.info/?page_id=124">Co-advised Master theses</a></td>
<td style="border:0pt; padding:0pt;">34 <a href="http://shadow.dyndns.info/?page_id=109">Peer-reviewed conference papers</a></td>
<td style="border:0pt; padding:0pt;">83 <a href="http://shadow.dyndns.info/?page_id=288#committees">Committees</a> at 36 <a href="http://shadow.dyndns.info/?page_id=288#committees">conference series</a></td>
</tr>
<tr>
<td style="border:0pt; padding:0pt;">2 Mentored summer faculty</td>
<td style="border:0pt; padding:0pt;">27 <a href="http://shadow.dyndns.info/?page_id=1125">Peer-reviewed workshop papers</a></td>
<td style="border:0pt; padding:0pt;">27 <a href="http://shadow.dyndns.info/?page_id=288#reviews">Article and book proposal reviews</a></td>
</tr>
<tr>
<td style="border:0pt; padding:0pt;">10 Direct reports over the past 7 years</td>
<td style="border:0pt; padding:0pt;">8 <a href="http://shadow.dyndns.info/?page_id=116">Peer-reviewed conference posters</a></td>
<td style="border:0pt; padding:0pt;">11 <a href="http://shadow.dyndns.info/?page_id=288#exhibitions">Conference booth exhibitions</a></td>
</tr>
<tr>
<td style="border:0pt; padding:0pt;"><a href="http://www.oakland.edu/enp" target="www.oakland.edu_enp">Erdős number</a> of 5</td>
<td style="border:0pt; padding:0pt;"><a href="http://www.harzing.com/pop.htm" target="hindex">910+ Total publication citations</a></td>
<td style="border:0pt; padding:0pt;"><a href="http://en.wikipedia.org/wiki/H-index" target="en.wikipedia.org_wiki__H-index">H-index</a> of <a href="http://www.harzing.com/pop.htm" target="hindex">16</a> / <a href="http://en.wikipedia.org/wiki/G-index" target="en.wikipedia.org_wiki__G-index">G-index</a> of <a href="http://www.harzing.com/pop.htm" target="hindex">27</a></td>
</tr>
</table>
<h4>Ongoing Research Activities</h4>
<p>
<b>2012-&#8230;:</b> HPC resilience co-design toolkit evaluating the resilience/power/performance cost/benefit trade-off of resilience solutions, identifying hardware/software resilience properties, and coordinating interfaces/responsibilities of individual hardware/software components <a href="?page_id=1443"><i>&#8230; more</i></a>
</p>
<h4>Upcoming Deadlines</h4>
<p>
<b>2013-09-01:</b> <a href="http://www.csm.ornl.gov/srt/conferences/Scala/2013" target="www.csm.ornl.gov_srt_conferences_Scala_2013">4th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA) 2013</a> at the 26th IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC), Denver, CO, USA, November 18, 2013.<br />
<b>2013-05-31:</b> <a href="http://xcr.cenit.latech.edu/resilience2013/" target="xcr.cenit.latech.edu_resilience2013">6th Workshop on Resiliency in High-Performance Computing (Resilience) 2013</a> at the 19th International European Conference on Parallel and Distributed Computing (Euro-Par), Aachen, Germany, August 26-30, 2013.
</p>
<h4>Recent Events</h4>
<p>
<b>2013-02-12:</b> Presentation of the sole-authored research paper, <a href="?page_id=109">Investigating Operating System Noise in Extreme-Scale High-Performance Computing Systems using Simulation</a>, at the <a href="http://www.iasted.org/conferences/home-795.html" target="www.iasted.org/conferences/home-795.html">11th IASTED International Conference on Parallel and Distributed Computing and Networks (PDCN) 2013</a>, Innsbruck, Austria.<br />
<b>2013-01-31:</b> First meeting of the Technical Council on Resilience for the <a href="http://science.energy.gov/ascr" target="science.energy.gov_ascr">Advanced Scientific Computing Research Program</a> at the <a href="http://science.energy.gov" target="science.energy.gov">Office of Science</a> of the <a href="http://energy.gov" target="energy.gov">U.S. Department of Energy</a>, Germantown, MD, USA.<br />
<b>2012-11-15:</b> Chair of the Birds-of-a-Feather session on Resilience for Extreme-scale High-performance Computing at the <a href="http://sc12.supercomputing.org" target="sc12.supercomputing.org">25th IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2012</a>, Salt Lake City, UT, USA.<br />
<b>2012-11-15:</b> Presentation by David Fiala of the co-authored research paper, <a href="?page_id=109">Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing</a>, at the <a href="http://sc12.supercomputing.org" target="sc12.supercomputing.org">25th IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2012</a>, Salt Lake City, UT, USA.<br />
<b>2012-11-12:</b> After its recent upgrade to 261,632 NVIDIA K20x accelerator cores and 298,592 AMD Opteron cores, <a href="http://www.olcf.ornl.gov/titan" target="www.olcf.ornl.gov_titan">ORNL&#8217;s Titan Cray XK7 supercomputer</a> is ranked <a href="http://www.top500.org/lists/2012/11" target="www.top500.org_lists_2012_11">1st in the Top 500 List</a> of supercomputers with a LINPACK performance of 17.95 PFlops and <a href="http://www.green500.org/lists/green201211" target="www.green500.org_lists_green201211">3rd in the Green 500 List</a> of energy-efficient supercomputers.<br />
<b>2012-11-11:</b> Program Chair of the <a href="http://www.csm.ornl.gov/srt/conferences/Scala/2012" target="www.csm.ornl.gov/srt/conferences/Scala/2012">Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA)</a> at the <a href="http://sc12.supercomputing.org" target="sc12.supercomputing.org">25th IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2012</a>, Salt Lake City, UT, USA.
</p>
<h4>Important Peer-reviewed Journal Publications</h4>
<p><em><small>Symbols: <img style="border-style: none;" src="images/txt.gif" border="0" alt="Abstract" height="10pt"> Abstract, <img style="border-style: none;" src="images/pdf.gif" border="0" alt="Publication" height="10pt"> Publication, <img style="border-style: none;" src="images/bib.gif" border="0" alt="BibTeX Citation" height="10pt"> BibTeX Citation, <img src="images/doi.gif" border="0" style="border-style:none" alt="DOI Link" height="10pt"> DOI Link</small></em></p>
<div class="list">
<ol>
<li>Christian Engelmann. <b>Scaling To A Million Cores And Beyond: Using Light-Weight Simulation to Understand The Challenges Ahead On The Road To Exascale</b>. <i><a href="http://www.elsevier.com/locate/fgcs" target="www.elsevier.com/locate/fgcs">Future Generation Computer Systems (FGCS)</a></i>, 2013. <a href="http://www.elsevier.com" target="www.elsevier.com">Elsevier B.V, Amsterdam, The Netherlands</a>. To appear. <a href="javascript:showAbstract('As supercomputers scale to 1,000 PFlop/s over the next decade, investigating the performance of parallel applications at scale on future architectures and the performance impact of different architecture choices for high-performance computing (HPC) hardware/software co-design is crucial. This paper summarizes recent efforts in designing and implementing a novel HPC hardware/software co-design toolkit. The presented Extreme-scale Simulator (xSim) permits running an HPC application in a controlled environment with millions of concurrent execution threads while observing its performance in a simulated extreme-scale HPC system using architectural models and virtual timing. This paper demonstrates the capabilities and usefulness of the xSim performance investigation toolkit, such as its scalability to 2^27 simulated Message Passing Interface (MPI) ranks on 960 real processor cores, the capability to evaluate the performance of different MPI collective communication algorithms, and the ability to evaluate the performance of a basic Monte Carlo application with different architectural parameters.');"><img src="images/txt.gif" border="0" style="border-style:none" height="10pt" alt="Abstract"></a> <a href="?page_id=26#engelmann13scaling"><img src="images/bib.gif" border="0" style="border-style:none" height="10pt" alt="BibTeX Citation"></a></li>
<li>Chao Wang, Frank Mueller, Christian Engelmann, and Stephen L. Scott. <b>Proactive Process-Level Live Migration and Back Migration in HPC Environments</b>. <i><a href="http://www.elsevier.com/locate/jpdc" target="www.elsevier.com/locate/jpdc">Journal of Parallel and Distributed Computing (JPDC)</a></i>, volume 72, number 2, pages 254-267, 2012. <a href="http://www.elsevier.com" target="www.elsevier.com">Elsevier B.V, Amsterdam, The Netherlands</a>. ISSN 0743-7315. <a href="javascript:showAbstract('As the number of nodes in high-performance computing environments keeps increasing, faults are becoming common place. Reactive fault tolerance (FT) often does not scale due to massive I/O requirements and relies on manual job resubmission. This work complements reactive with proactive FT at the process level. Through health monitoring, a subset of node failures can be anticipated when one&amp;#39;s health deteriorates. A novel process-level live migration mechanism supports continued execution of applications during much of process migration. This scheme is integrated into an MPI execution environment to transparently sustain health-inflicted node failures, which eradicates the need to restart and requeue MPI jobs. Experiments indicate that 1-6.5 s of prior warning are required to successfully trigger live process migration while similar operating system virtualization mechanisms require 13-24 s. This self-healing approach complements reactive FT by nearly cutting the number of checkpoints in half when 70% of the faults are handled proactively. The work also provides a novel back migration approach to eliminate load imbalance or bottlenecks caused by migrated tasks. 
Experiments indicate the larger the amount of outstanding execution, the higher the benefit due to back migration.');"><img src="images/txt.gif" border="0" style="border-style:none" height="10pt" alt="Abstract"></a> <a href="http://www.christian-engelmann.info/publications/wang12proactive.pdf" target="publication"><img src="images/pdf.gif" border="0" style="border-style:none" height="10pt" alt="Publication"></a> <a href="?page_id=26#wang12proactive"><img src="images/bib.gif" border="0" style="border-style:none" height="10pt" alt="BibTeX Citation"></a> <a href="http://dx.doi.org/10.1016/j.jpdc.2011.10.009" target="publication"><img src="images/doi.gif" border="0" style="border-style:none" height="10pt" alt="DOI Link"></a></li>
<li>Xubin (Ben) He, Li Ou, Christian Engelmann, Xin Chen, and Stephen L. Scott. <b>Symmetric Active/Active Metadata Service for High Availability Parallel File Systems</b>. <i><a href="http://www.elsevier.com/locate/jpdc" target="www.elsevier.com/locate/jpdc">Journal of Parallel and Distributed Computing (JPDC)</a></i>, volume 69, number 12, pages 961-973, 2009. <a href="http://www.elsevier.com" target="www.elsevier.com">Elsevier B.V, Amsterdam, The Netherlands</a>. ISSN 0743-7315. <a href="javascript:showAbstract('High availability data storage systems are critical for many applications as research and business become more data-driven. Since metadata management is essential to system availability, multiple metadata services are used to improve the availability of distributed storage systems. Past research focused on the active/standby model, where each active service has at least one redundant idle backup. However, interruption of service and even some loss of service state may occur during a fail-over depending on the used replication technique. In addition, the replication overhead for multiple metadata services can be very high. The research in this paper targets the symmetric active/active replication model, which uses multiple redundant service nodes running in virtual synchrony. In this model, service node failures do not cause a fail-over to a backup and there is no disruption of service or loss of service state. We further discuss a fast delivery protocol to reduce the latency of the needed total order broadcast. 
Our prototype implementation shows that metadata service high availability can be achieved with an acceptable performance trade-off using our symmetric active/active metadata service solution.');"><img src="images/txt.gif" border="0" style="border-style:none" height="10pt" alt="Abstract"></a> <a href="http://www.christian-engelmann.info/publications/he09symmetric.pdf" target="publication"><img src="images/pdf.gif" border="0" style="border-style:none" height="10pt" alt="Publication"></a> <a href="?page_id=26#he09symmetric"><img src="images/bib.gif" border="0" style="border-style:none" height="10pt" alt="BibTeX Citation"></a> <a href="http://dx.doi.org/10.1016/j.jpdc.2009.08.004" target="publication"><img src="images/doi.gif" border="0" style="border-style:none" height="10pt" alt="DOI Link"></a></li>
<li>Christian Engelmann, Stephen L. Scott, Chokchai (Box) Leangsuksun, and Xubin (Ben) He. <b>Symmetric Active/Active High Availability for High-Performance Computing System Services</b>. <i><a href="http://www.academypublisher.com/jcp" target="www.academypublisher.com/jcp">Journal of Computers (JCP)</a></i>, volume 1, number 8, pages 43-54, 2006. <a href="http://www.academypublisher.com" target="www.academypublisher.com">Academy Publisher, Oulu, Finland</a>. ISSN 1796-203X. <a href="javascript:showAbstract('This work aims to pave the way for high availability in high-performance computing (HPC) by focusing on efficient redundancy strategies for head and service nodes. These nodes represent single points of failure and control for an entire HPC system as they render it inaccessible and unmanageable in case of a failure until repair. The presented approach introduces two distinct replication methods, internal and external, for providing symmetric active/active high availability for multiple redundant head and service nodes running in virtual synchrony utilizing an existing process group communication system for service group membership management and reliable, totally ordered message delivery. 
Presented results of a prototype implementation that offers symmetric active/active replication for HPC job and resource management using external replication show that the highest level of availability can be provided with an acceptable performance trade-off.');"><img src="images/txt.gif" border="0" style="border-style:none" height="10pt" alt="Abstract"></a> <a href="http://www.christian-engelmann.info/publications/engelmann06symmetric.pdf" target="publication"><img src="images/pdf.gif" border="0" style="border-style:none" height="10pt" alt="Publication"></a> <a href="?page_id=26#engelmann06symmetric"><img src="images/bib.gif" border="0" style="border-style:none" height="10pt" alt="BibTeX Citation"></a> <a href="http://www.academypublisher.com/jcp/vol01/no08/jcp01084354.html" target="publication"><img src="images/doi.gif" border="0" style="border-style:none" height="10pt" alt="DOI Link"></a></li>
</ol>
</div>
<h4>Important Peer-reviewed Conference Publications</h4>
<p><em><small>Symbols: <img style="border-style: none;" src="images/txt.gif" border="0" alt="Abstract" height="10pt"> Abstract, <img style="border-style: none;" src="images/pdf.gif" border="0" alt="Publication" height="10pt"> Publication, <img style="border-style: none;" src="images/ppt.gif" border="0" alt="Presentation" height="10pt"> Presentation, <img style="border-style: none;" src="images/bib.gif" border="0" alt="BibTeX Citation" height="10pt"> BibTeX Citation, <img style="border-style: none;" src="images/doi.gif" border="0" alt="DOI Link" height="10pt"> DOI Link</small></em></p>
<div class="list">
<ol>
<li>David Fiala, Frank Mueller, Christian Engelmann, Kurt Ferreira, Ron Brightwell, and Rolf Riesen. <b>Detection and Correction of Silent Data Corruption for Large-Scale High-Performance Computing</b>. In <i>Proceedings of the <a href="http://sc12.supercomputing.org" target="sc12.supercomputing.org">25th IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2012</a></i>, pages 78:1-78:12, Salt Lake City, UT, USA, November 10-16, 2012. <a href="http://www.acm.org" target="www.acm.org">ACM Press, New York, NY, USA</a>. ISBN 978-1-4673-0804-5. Acceptance rate 21.2% (100/472). <a href="javascript:showAbstract('Faults have become the norm rather than the exception for high-end computing on clusters with 10s/100s of thousands of cores. Exacerbating this situation, some of these faults remain undetected, manifesting themselves as silent errors that corrupt memory while applications continue to operate and report incorrect results. This paper studies the potential for redundancy to both detect and correct soft errors in MPI message-passing applications. Our study investigates the challenges inherent to detecting soft errors within MPI application while providing transparent MPI redundancy. By assuming a model wherein corruption in application data manifests itself by producing differing MPI message data between replicas, we study the best suited protocols for detecting and correcting MPI data that is the result of corruption. To experimentally validate our proposed detection and correction protocols, we introduce RedMPI, an MPI library which resides in the MPI profiling layer. RedMPI is capable of both online detection and correction of soft errors that occur in MPI applications without requiring any modifications to the application source by utilizing either double or triple redundancy. 
Our results indicate that our most efficient consistency protocol can successfully protect applications experiencing even high rates of silent data corruption with runtime overheads between 0% and 30% as compared to unprotected applications without redundancy. Using our fault injector within RedMPI, we observe that even a single soft error can have profound effects on running applications, causing a cascading pattern of corruption in most cases causes that spreads to all other processes. RedMPI&amp;#39;s protection has been shown to successfully mitigate the effects of soft errors while allowing applications to complete with correct results even in the face of errors.');"><img src="images/txt.gif" border="0" style="border-style:none" height="10pt" alt="Abstract"></a> <a href="http://www.christian-engelmann.info/publications/fiala12detection2.pdf" target="publication"><img src="images/pdf.gif" border="0" style="border-style:none" height="10pt" alt="Publication"></a> <a href="http://www.christian-engelmann.info/publications/fiala12detection2.ppt.pdf" target="publication"><img src="images/ppt.gif" border="0" style="border-style:none" height="10pt" alt="Presentation"></a> <a href="?page_id=26#fiala12detection2"><img src="images/bib.gif" border="0" style="border-style:none" height="10pt" alt="BibTeX Citation"></a></li>
<li>James Elliott, Kishor Kharbas, David Fiala, Frank Mueller, Kurt Ferreira, and Christian Engelmann. <b>Combining Partial Redundancy and Checkpointing for HPC</b>. In <i>Proceedings of the <a href="http://icdcs-2012.org/" target="icdcs-2012.org/">32nd International Conference on Distributed Computing Systems (ICDCS) 2012</a></i>, pages 615-626, Macau, SAR, China, June 18-21, 2012. <a href="http://www.computer.org" target="www.computer.org">IEEE Computer Society, Los Alamitos, CA, USA</a>. ISBN 978-0-7695-4685-8. ISSN 1063-6927. Acceptance rate 13% (71/515). <a href="javascript:showAbstract('Today&amp;#39;s largest High Performance Computing (HPC) systems exceed one Petaflops (10^15 floating point operations per second) and exascale systems are projected within seven years. But reliability is becoming one of the major challenges faced by exascale computing. With billion-core parallelism, the mean time to failure is projected to be in the range of minutes or hours instead of days. Failures are becoming the norm rather than the exception during execution of HPC applications. Current fault tolerance techniques in HPC focus on reactive ways to mitigate faults, namely via checkpoint and restart (C/R). Apart from storage overheads, C/R-based fault recovery comes at an additional cost in terms of application performance because normal execution is disrupted when checkpoints are taken. Studies have shown that applications running at a large scale spend more than 50% of their total time saving checkpoints, restarting and redoing lost work. Redundancy is another fault tolerance technique, which employs redundant processes performing the same task. If a process fails, a replica of it can take over its execution. Thus, redundant copies can decrease the overall failure rate. The downside of redundancy is that extra resources are required and there is an additional overhead on communication and synchronization. 
This work contributes a model and analyzes the benefit of C/R in coordination with redundancy at different degrees to minimize the total wallclock time and resources utilization of HPC applications. We further conduct experiments with an implementation of redundancy within the MPI layer on a cluster. Our experimental results confirm the benefit of dual and triple redundancy - but not for partial redundancy - and show a close fit to the model. At 80,000 processes, dual redundancy requires twice the number of processing resources for an application but allows two jobs of 128 hours wallclock time to finish within the time of just one job without redundancy. For narrow ranges of processor counts, partial redundancy results in the lowest time. Once the count exceeds 770,000, triple redundancy has the lowest overall cost. Thus, redundancy allows one to trade-off additional resource requirements against wallclock time, which provides a tuning knob for users to adapt to resource availabilities.');"><img src="images/txt.gif" border="0" style="border-style:none" height="10pt" alt="Abstract"></a> <a href="http://www.christian-engelmann.info/publications/elliott12combining.pdf" target="publication"><img src="images/pdf.gif" border="0" style="border-style:none" height="10pt" alt="Publication"></a> <a href="http://www.christian-engelmann.info/publications/elliott12combining.ppt.pdf" target="publication"><img src="images/ppt.gif" border="0" style="border-style:none" height="10pt" alt="Presentation"></a> <a href="?page_id=26#elliott12combining"><img src="images/bib.gif" border="0" style="border-style:none" height="10pt" alt="BibTeX Citation"></a> <a href="http://dx.doi.org/10.1109/ICDCS.2012.56" target="publication"><img src="images/doi.gif" border="0" style="border-style:none" height="10pt" alt="DOI Link"></a></li>
<li>Chao Wang, Sudharshan S. Vazhkudai, Xiaosong Ma, Fei Meng, Youngjae Kim, and Christian Engelmann. <b>NVMalloc: Exposing an Aggregate SSD Store as a Memory Partition in Extreme-Scale Machines</b>. In <i>Proceedings of the <a href="http://www.ipdps.org" target="www.ipdps.org">26th IEEE International Parallel and Distributed Processing Symposium (IPDPS) 2012</a></i>, pages 957-968, Shanghai, China, May 21-25, 2012. <a href="http://www.computer.org" target="www.computer.org">IEEE Computer Society, Los Alamitos, CA, USA</a>. ISBN 978-0-7695-4675-9. Acceptance rate 21% (118/569). <a href="javascript:showAbstract('DRAM is a precious resource in extreme-scale machines and is increasingly becoming scarce, mainly due to the growing number of cores per node. On future multi-petaflop and exaflop machines, the memory pressure is likely to be so severe that we need to rethink our memory usage models. Fortunately, the advent of non-volatile memory (NVM) offers a unique opportunity in this space. Current NVM offerings possess several desirable properties, such as low cost and power efficiency, but also suffer from high latency and lifetime issues. We need rich techniques to be able to use them alongside DRAM. In this paper, we propose a novel approach to exploiting NVM as a secondary memory partition so that applications can explicitly allocate and manipulate memory regions therein. More specifically, we propose an NVMalloc library with a suite of services that enables applications to access a distributed NVM storage system. We have devised ways within NVMalloc so that the storage system, built from compute node-local NVM devices, can be accessed in a byte-addressable fashion using the memory mapped I/O interface. Our approach has the potential to re-energize out-of-core computations on large-scale machines by having applications allocate certain variables through NVMalloc, thereby increasing the overall memory available for the application. 
Our evaluation on a 128-core cluster shows that NVMalloc enables applications to compute problem sizes larger than the physical memory in a cost-effective manner. It can achieve better performance with increased computation time between NVM memory accesses or increased data access locality. In addition, our results suggest that while NVMalloc enables transparent access to NVM-resident variables, the explicit control it provides is crucial to optimize application performance.');"><img src="images/txt.gif" border="0" style="border-style:none" height="10pt" alt="Abstract"></a> <a href="http://www.christian-engelmann.info/publications/wang12nvmalloc.pdf" target="publication"><img src="images/pdf.gif" border="0" style="border-style:none" height="10pt" alt="Publication"></a> <a href="http://www.christian-engelmann.info/publications/wang12nvmalloc.ppt.pdf" target="publication"><img src="images/ppt.gif" border="0" style="border-style:none" height="10pt" alt="Presentation"></a> <a href="?page_id=26#wang12nvmalloc"><img src="images/bib.gif" border="0" style="border-style:none" height="10pt" alt="BibTeX Citation"></a> <a href="http://dx.doi.org/10.1109/IPDPS.2012.90" target="publication"><img src="images/doi.gif" border="0" style="border-style:none" height="10pt" alt="DOI Link"></a></li>
<li>Swen B&ouml;hm and Christian Engelmann. <b>xSim: The Extreme-Scale Simulator</b>. In <i>Proceedings of the <a href="http://hpcs11.cisedu.info" target="hpcs11.cisedu.info">International Conference on High Performance Computing and Simulation (HPCS) 2011</a></i>, pages 280-286, Istanbul, Turkey, July 4-8, 2011. <a href="http://www.computer.org" target="www.computer.org">IEEE Computer Society, Los Alamitos, CA, USA</a>. ISBN 978-1-61284-383-4. Acceptance rate 28.1% (48/171). <a href="javascript:showAbstract('Investigating parallel application performance properties at scale is becoming an important part of high-performance computing (HPC) application development and deployment. The Extreme-scale Simulator (xSim) is a performance investigation toolkit that permits running an application in a controlled environment at extreme scale without the need for a respective extreme-scale HPC system. Using a lightweight parallel discrete event simulation, xSim executes a parallel application with a virtual wall clock time, such that performance data can be extracted based on a processor model and a network model. This paper presents significant enhancements to the xSim toolkit prototype that provide a more complete Message Passing Interface (MPI) support and improve its versatility. These enhancements include full virtual MPI group, communicator and collective communication support, and global variables support. 
The new capabilities are demonstrated by executing the entire NAS Parallel Benchmark suite in a simulated HPC environment.');"><img src="images/txt.gif" border="0" style="border-style:none" height="10pt" alt="Abstract"></a> <a href="http://www.christian-engelmann.info/publications/boehm11xsim.pdf" target="publication"><img src="images/pdf.gif" border="0" style="border-style:none" height="10pt" alt="Publication"></a> <a href="http://www.christian-engelmann.info/publications/boehm11xsim.ppt.pdf" target="publication"><img src="images/ppt.gif" border="0" style="border-style:none" height="10pt" alt="Presentation"></a> <a href="?page_id=26#boehm11xsim"><img src="images/bib.gif" border="0" style="border-style:none" height="10pt" alt="BibTeX Citation"></a> <a href="http://dx.doi.org/10.1109/HPCSim.2011.5999835" target="publication"><img src="images/doi.gif" border="0" style="border-style:none" height="10pt" alt="DOI Link"></a></li>
<li>Chao Wang, Frank Mueller, Christian Engelmann, and Stephen L. Scott. <b>Hybrid Checkpointing for MPI Jobs in HPC Environments</b>. In <i>Proceedings of the <a href="http://grid.sjtu.edu.cn/icpads10" target="grid.sjtu.edu.cn/icpads10">16th IEEE International Conference on Parallel and Distributed Systems (ICPADS) 2010</a></i>, pages 524-533, Shanghai, China, December 8-10, 2010. <a href="http://www.computer.org" target="www.computer.org">IEEE Computer Society, Los Alamitos, CA, USA</a>. ISBN 978-0-7695-4307-9. Acceptance rate 29.6% (77/188). <a href="javascript:showAbstract('As the core count in high-performance computing systems keeps increasing, faults are becoming commonplace. Checkpointing addresses such faults but captures full process images even though only a subset of the process image changes between checkpoints. We have designed a hybrid checkpointing technique for MPI tasks of high-performance applications. This technique alternates between full and incremental checkpoints: At incremental checkpoints, only data changed since the last checkpoint is captured. Our implementation integrates new BLCR and LAM/MPI features that complement traditional full checkpoints. This results in significantly reduced checkpoint sizes and overheads with only moderate increases in restart overhead. After accounting for cost and savings, benefits due to incremental checkpoints are an order of magnitude larger than overheads on restarts. 
We further derive qualitative results indicating an optimal balance between full/incremental checkpoints of our novel approach at a ratio of 1:9, which outperforms both always-full and always-incremental checkpointing.');"><img src="images/txt.gif" border="0" style="border-style:none" height="10pt" alt="Abstract"></a> <a href="http://www.christian-engelmann.info/publications/wang10hybrid2.pdf" target="publication"><img src="images/pdf.gif" border="0" style="border-style:none" height="10pt" alt="Publication"></a> <a href="http://www.christian-engelmann.info/publications/wang10hybrid2.ppt.pdf" target="publication"><img src="images/ppt.gif" border="0" style="border-style:none" height="10pt" alt="Presentation"></a> <a href="?page_id=26#wang10hybrid2"><img src="images/bib.gif" border="0" style="border-style:none" height="10pt" alt="BibTeX Citation"></a> <a href="http://dx.doi.org/10.1109/ICPADS.2010.48" target="publication"><img src="images/doi.gif" border="0" style="border-style:none" height="10pt" alt="DOI Link"></a></li>
<li>Min Li, Sudharshan S. Vazhkudai, Ali R. Butt, Fei Meng, Xiaosong Ma, Youngjae Kim, Christian Engelmann, and Galen Shipman. <b>Functional Partitioning to Optimize End-to-End Performance on Many-Core Architectures</b>. In <i>Proceedings of the <a href="http://sc10.supercomputing.org" target="sc10.supercomputing.org">23rd IEEE/ACM International Conference on High Performance Computing, Networking, Storage and Analysis (SC) 2010</a></i>, pages 1-12, New Orleans, LA, USA, November 13-19, 2010. <a href="http://www.acm.org" target="www.acm.org">ACM Press, New York, NY, USA</a>. ISBN 978-1-4244-7559-9. Acceptance rate 19.8% (50/253). <a href="javascript:showAbstract('Scaling computations on emerging massive-core supercomputers is a daunting task, which coupled with the significantly lagging system I/O capabilities exacerbates applications&amp;#39; end-to-end performance. The I/O bottleneck often negates potential performance benefits of assigning additional compute cores to an application. In this paper, we address this issue via a novel functional partitioning (FP) runtime environment that allocates cores to specific application tasks - checkpointing, de-duplication, and scientific data format transformation - so that the deluge of cores can be brought to bear on the entire gamut of application activities. The focus is on utilizing the extra cores to support HPC application I/O activities and also leverage solid-state disks in this context. 
For example, our evaluation shows that dedicating 1 core on an oct-core machine for checkpointing and its assist tasks using FP can improve overall execution time of a FLASH benchmark on 80 and  160 cores by 43.95% and 41.34%, respectively.');"><img src="images/txt.gif" border="0" style="border-style:none" height="10pt" alt="Abstract"></a> <a href="http://www.christian-engelmann.info/publications/li10functional.pdf" target="publication"><img src="images/pdf.gif" border="0" style="border-style:none" height="10pt" alt="Publication"></a> <a href="http://www.christian-engelmann.info/publications/li10functional.ppt.pdf" target="publication"><img src="images/ppt.gif" border="0" style="border-style:none" height="10pt" alt="Presentation"></a> <a href="?page_id=26#li10functional"><img src="images/bib.gif" border="0" style="border-style:none" height="10pt" alt="BibTeX Citation"></a> <a href="http://dx.doi.org/10.1109/SC.2010.28" target="publication"><img src="images/doi.gif" border="0" style="border-style:none" height="10pt" alt="DOI Link"></a></li>
</ol>
</div>
<p><script language="JavaScript">
/**
 * Opens (or reuses) a 400x400 popup window named "Abstract", centered on
 * the screen, and writes the given markup into its document.
 *
 * Does nothing when the browser refuses to open the popup.
 *
 * @param {string} text Markup to write into the popup document.
 */
function showAbstract (text) {
  var width  = 400;
  var height = 400;
  // Offsets that center a width x height window on the screen.
  var left   = (screen.width  - width ) / 2;
  var top    = (screen.height - height) / 2;
  // Joined with ', ' this yields the same feature string as before.
  var features = [
    'width='  + width,
    'height=' + height,
    'left='   + left,
    'top='    + top,
    'toolbar=no',
    'location=no',
    'directories=no',
    'status=no',
    'menubar=no',
    'copyhistory=no',
    'scrollbars=yes',
    'resizable=yes'
  ].join(', ');
  var win = window.open('', 'Abstract', features);
  // window.open returns null when the popup is blocked; bail out rather
  // than throwing a TypeError on win.document.
  if (!win) {
    return;
  }
  win.document.write(text);
  // Close the document stream that write() opened.
  win.document.close();
}
</script></p>
]]></content:encoded>
			<wfw:commentRss>http://www.christian-engelmann.info/?feed=rss2&#038;p=1</wfw:commentRss>
		<slash:comments>0</slash:comments>
		</item>
	</channel>
</rss>
