; Reading list with per-paper review forms.
;
; Each entry holds the citation data for one reading, followed by a set of
; review fields (studentname, summary, contribution1-5, weakness1-5,
; interesting, opinions). The review fields are blank in this file --
; presumably they are filled in by the student before submission (TODO:
; confirm with course staff). The interesting field carries the placeholder
; "high/med/low". Standard bibliography styles ignore these extra fields.
;
; Hand this in to: ece849-staff+hw (at) ece.cmu.edu
;
; NOTE: the address above is spelled with "(at)" on purpose. BibTeX treats an
; at-sign anywhere outside an entry as the start of a new entry, so a literal
; e-mail address in this comment makes BibTeX report a parse error.

; Web document without author or journal: typed as misc, and the key field
; gives styles something to sort and label by.
@misc{esa96_ariane_501,
  key           = "ESA",
  title         = "Ariane 501 -- Presentation of Inquiry Board Report (Summary)",
  organization  = "ESA",
  year          = "1996",
  url           = "http://www.esa.int/esaCP/Pr_33_1996_p_EN.html",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{weinstock95_sift,
  author        = "Weinstock, Charles B.",
  affiliation   = "Carnegie Mellon University - SEI, Pittsburgh, PA, USA",
  title         = "{SIFT}: System Design and Implementation",
  booktitle     = "Fault-Tolerant Computing, 1995, Highlights from Twenty-Five Years",
  organization  = "FTCS",
  year          = "1995",
  url           = "http://ieeexplore-beta.ieee.org//iel3/3846/11214/00532607.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@article{cristian91understanding,
  author        = "Cristian, Flaviu",
  affiliation   = "IBM, San Jose, CA, USA",
  title         = "Understanding fault-tolerant distributed systems",
  journal       = "Communications of the ACM",
  volume        = "34",
  number        = "2",
  pages         = "56--78",
  year          = "1991",
  abstract      = "We propose a small number of basic concepts that can be used
                   to explain the architecture of fault-tolerant distributed
                   systems and we discuss a list of architectural issues that we
                   find useful to consider when designing or examining such
                   systems. For each issue we present known solutions and design
                   alternatives, we discuss their relative merits and we give
                   examples of systems which adopt one approach or the other.
                   The aim is to introduce some order in the complex discipline
                   of designing and understanding fault-tolerant distributed
                   systems.",
  url           = "http://portal.acm.org/citation.cfm?doid=102792.102801",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

; Journal paper (it carries volume, number and pages), hence article rather
; than a conference type.
@article{gray90_a_census_of_tandem_system,
  author        = "Gray, Jim",
  affiliation   = "Tandem Computers Inc., Cupertino",
  title         = "A Census of {Tandem} System Availability Between 1985 and 1990",
  journal       = "IEEE Transactions on Reliability",
  organization  = "IEEE",
  year          = "1990",
  volume        = "39",
  number        = "4",
  pages         = "409--418",
  abstract      = "Tandem computer systems are designed to be single-fault
                   tolerant. This paper takes a census of customer outages
                   reported to Tandem. The census shows a clear improvement in
                   the reliability of hardware maintenance. It indicates that
                   now (1990) software is the major source of reported outages
                   (62\%), followed by system operations (15\%). This is a
                   dramatic shift from the statistics in 1985. Even discounting
                   systematic under-reporting of operations and environmental
                   outages, the conclusion is clear: Hardware faults and
                   hardware maintenance are no longer a major source of outages.
                   As the other components of the system become increasingly
                   reliable, software necessarily becomes the dominant cause of
                   outages. Achieving higher-availability requires: 1)
                   improvement in software quality and software-fault tolerance,
                   2) simpler operations, and 3) tolerance of operational
                   faults.",
  url           = "http://ieeexplore-beta.ieee.org//iel1/24/2133/00058719.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

; Supplemental Readings

; Magazine article (it carries volume, number and pages), hence article.
@article{hatton97,
  author        = "Hatton, L.",
  title         = "Software failures-follies and fallacies",
  journal       = "IEE Review",
  organization  = "IEEE",
  year          = "1997",
  volume        = "43",
  number        = "2",
  pages         = "49--52",
  abstract      = "Software failure is becoming a serious issue. Ariane 5
                   provided a recent spectacular example of how a simple
                   mistake, entirely avoidable, was allowed to sneak through the
                   software verification stage and cause an immensely expensive
                   failure. However, it is not just the aerospace industry which
                   suffers such traumas. Here, the author discusses some common
                   misconceptions",
  url           = "http://ieeexplore.ieee.org/iel1/2188/12713/00586152.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{hoyme92,
  author        = "Hoyme, K. and Driscoll, K.",
  title         = "Safebus",
  booktitle     = "Digital Avionics Systems Conference, IEEE/AIAA 11th",
  year          = "1992",
  pages         = "68--73",
  abstract      = "",
  url           = "http://ieeexplore.ieee.org/iel2/1033/6983/00282179.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

; Typed as inproceedings so that the page range is actually printed; the
; booktitle is supplied by the reviewer (TODO: verify against the IEEE record).
@inproceedings{lee93_faults,
  author        = "Lee, I. and Iyer, R. K.",
  title         = "Faults, Symptoms, and Software Fault Tolerance in the {Tandem} {GUARDIAN90} Operating System",
  booktitle     = "Proceedings of the 23rd International Symposium on Fault-Tolerant Computing (FTCS-23)",
  year          = "1993",
  pages         = "20--29",
  abstract      = "The authors present a measurement-based study of software
                   failures and recovery in the Tandem GUARDIAN90 operating
                   system using a collection of memory dump analyses of field
                   software failures. They identify the effects of software
                   faults on the processor state and trace the propagation of
                   the effects to other areas of the system. They also evaluate
                   the role of the defensive programming techniques and the
                   software fault tolerance of the process pair mechanism
                   implemented in the Tandem system. Results show that the
                   Tandem system tolerates nearly 82\% of reported field
                   software faults, thus demonstrating the effectiveness of the
                   system against software faults. Consistency checks made by
                   the operating system detect 52\% of software problems and
                   prevent any error propagation in 31\% of software problems.
                   Results also show that 72\% of reported field software
                   failures are recurrences of known software faults and 70\% of
                   the recurrence groups have identical characteristics",
  url           = "http://ieeexplore.ieee.org//iel3/4964/13650/00627304.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@article{leveson93,
  author        = "Leveson, N. and Turner, C.",
  title         = "An Investigation of the {Therac-25} Accidents",
  journal       = "IEEE Computer",
  volume        = "26",
  number        = "7",
  year          = "1993",
  pages         = "18--41",
  abstract      = "Between June 1985 and January 1987, the Therac-25 medical
                   electron accelerator was involved in six massive radiation
                   overdoses. As a result, several people died and others were
                   seriously injured. A detailed investigation of the factors
                   involved in the software-related overdoses and attempts by
                   users, manufacturers, and government agencies to deal with
                   the accidents is presented. The authors demonstrate the
                   complex nature of accidents and the need to investigate all
                   aspects of system development and operation in order to
                   prevent future accidents. The authors also present some
                   lessons learned in terms of system engineering, software
                   engineering, and government regulation of safety-critical
                   systems containing software components",
  url           = "http://ieeexplore.ieee.org/iel4/2/6812/00274940.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

; The year follows the cited 1995 volume named in booktitle. The note about
; the 1988 original is the reviewer's reading of the citation key (TODO:
; confirm).
@inproceedings{powell88,
  author        = "Powell, D. and Bonn, G. and Seaton, D. and Verissimo, P. and Waeselynck, F.",
  title         = "The {Delta-4} Approach to Dependability in Open Distributed Computing Systems",
  booktitle     = "Fault-Tolerant Computing, 1995, Highlights from Twenty-Five Years",
  year          = "1995",
  note          = "Reprint of a 1988 paper",
  abstract      = "As part of the European Strategic Programme for Research in
                   Information Technology (ESPRIT), the Delta-4 project is
                   seeking to define an open, fault-tolerant, distributed
                   computing architecture. The authors present the overall
                   Delta-4 framework for open, fault-tolerant, distributed
                   computing systems and sketch the current implementation,
                   which is based on a local area network with specific atomic
                   multicasting and error-processing protocols for communicating
                   between replicated software components. The system is used to
                   demonstrate the various fault-tolerance techniques by a
                   replicated database application",
  url           = "http://ieeexplore-beta.ieee.org//iel3/3846/11214/00532612.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

; The affiliation used to be embedded in the author name; it now lives in its
; own field so that the name parses as a single person.
@inproceedings{powell94,
  author        = "Powell, D.",
  affiliation   = "LAAS-CNRS",
  title         = "Distributed Fault Tolerance Lessons Learnt from {Delta-4}",
  booktitle     = "Workshop on Fault-Tolerant Architectures",
  year          = "1994",
  abstract      = "",
  url           = "http://citeseer.nj.nec.com/powell94distributed.html",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}