; Hand this in to: ece849-staff+hw@ece.cmu.edu ;Required Reading @article{azadmanesh00_omissive_faults, author = {M. H. Azadmanesh and R. M. Kieckhafer}, title = {Exploiting Omissive Faults in Synchronous Approximate Agreement}, journal = {IEEE Transactions on Computers}, volume = {49}, number = {10}, year = {2000}, pages = {1031--1042}, abstract = "In a fault-tolerant distributed system, it is often necessary for nonfaulty processes to agree on the value of a shared data item. The criterion of Approximate Agreement does not require processes to achieve exact agreement on a value; rather, they need only agree to within a predefined numerical tolerance. Approximate Agreement can be achieved through convergent voting algorithms. Previous research has studied convergent voting algorithms under mixed-mode or hybrid fault models, such as the Thambidurai and Park Hybrid fault model, comprised of three fault modes: asymmetric, symmetric, and benign. This paper makes three major contributions to the state of the art in fault-tolerant convergent voting. 1) We partition both the asymmetric and symmetric fault modes into disjoint omissive and transmissive submodes. The resulting five-mode hybrid fault model is a superset of previous hybrid fault models. 2) We present a new family of voting algorithms, called Omission Mean Subsequence Reduced (OMSR), which implicitly recognize and exploit omissive behavior in malicious faults while still maintaining full Byzantine fault tolerance. 3) We show that OMSR voting algorithms are more fault-tolerant than previous voting algorithms if any of the currently active faults is omissive.", url = "http://ieeexplore.ieee.org/iel5/12/19199/00888039.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{lamport85_sync_with_faults, author = "Lamport, Leslie, and Melliar-Smith", title = "Synchronizing Clocks in the Presence of Faults.", organization = "SRI International", journal = "Journal of the ACM", year = "1985", volume = "32", number = "1", pages = "53--78", url = "http://portal.acm.org/citation.cfm?doid=2455.2457", abstract = "Algorithms are described for maintaining clock synchrony in a distributed multiporcess system where each process has its own clock. These algorithms work in the presence of arbitrary clock or process failures, including ``two-faced clocks'' that present different values to different processes. Two of the algorithms require that fewer than one-third of the processes be faulty. A third algorithm works if fewer than half the processes are faulty, but requires digital signatures.", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{kiechafer88_maft, author = "R. Kiechafer, C. J. Walter, A. M. Finn, P. M. Thambidurai", title = "The MAFT Architecture for Distributed Fault Tolerance", journal = "IEEE Transactions on Computers", year = "1988", volume = "37", number = "4", abstract = "This paper describes the Multicomputer Architecture for Fault-Tolerance (MAFT), a distributed system designed to provide extremely reliable computation in real-time control systems. MAFT is based on the physical and functional partitioning of executive functions from application functions. The implementation of the executive functions in a special-purpose hardware processor allows the fault-tolerance functions to be transparent to the application porgrams and minimizes overhead...", url = "http://ieeexplore.ieee.org/iel1/12/134/00002183.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } ; Supplemental Reading @article{Vaidya95, author = "Vaidya, N.H. ; Pradhan, D.K.", title = "Degradable Byzantine agreement", journal = "IEEE Transactions on Computers 44,", year = "1995", pages = "146-50", number = "1", abstract = "Traditional Byzantine agreement protocols require all fault-free receivers to agree on an identical value. The proposed degradable agreement approach achieves traditional agreement up to m faults and a degraded form of agreement up to u faults (u>or=m), which allows fault-free receivers to agree on at most two different values (one of which is necessarily the default value). A degradable agreement algorithm and lower bounds are presented", url = "http://ieeexplore.ieee.org/iel1/12/8421/00367999.pdf", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @Conference{Frison82, author = "Frison, S.G. ; Wensley, J.H. ", title = "Interactive consistency and its impact on the design of TMR systems", inbook = "FTCS 12th Annual International Symposium on Fault-Tolerant Computing. Digest of Papers", year = "1982", pages = "228-33", abstract = "It is well known that in a TMR system it is not possible to guarantee correct operation in situations where correctly functioning processors can legitimately have differing results and where one faulty processor carries out a pattern of activity that confuses the two correctly functioning processors. The paper analyzes this general problem as it applies to certain design issues that arise in building a fault tolerant TMR system. The issues addressed are synchronization, the processing of analog data, and the handling of interrupts. It is shown that certain simplistic solutions that have been applied in the past fail to provide a guarantee of complete coverage of faults. Approaches are described that, while not guaranteeing absolute fault tolerance, can be shown to provide fault tolerance except against an extremely unlikely pattern of behavior of a faulty processor", url = "http://ieeexplore-beta.ieee.org//iel3/3846/11214/00532667.pdf", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Kistler92, author = "Kistler, J.J. ; Satyanarayanan, M.", title = "Disconnected operation in the Coda File System", journal = "ACM Transactions on Computer Systems 10,", year = "1992", pages = "3-25", number = "1", abstract = "Disconnected operation is a mode of operation that enables a client to continue accessing critical data during temporary failures of a shared data repository. An important, though not exclusive, application of disconnected operation is in supporting portable computers. The authors show that disconnected operation is feasible, efficient and usable by describing its design and implementation in the Coda File System. The central idea is that caching of data, now widely used for performance, can also be exploited to improve availability", url = "http://citeseer.nj.nec.com/kistler92disconnected.html", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Shin89, author = "Shin, K.G. ; Dolter, J.W.", title = "Alternative majority-voting methods for real-time computing systems", journal = "IEEE Transactions on Reliability 38,", year = "1989", pages = "58-64", number = "1", abstract = "Two techniques that provide a compromise between the high time overhead in maintaining synchronous voting and the difficulty of combining results in asynchronous voting are proposed. These techniques are specifically suited for real-time applications with a single-source/single-sink structure that need instantaneous error masking. They provide a compromise between a tightly synchronized system in which the synchronization overhead can be quite high, and an asynchronous system which lacks suitable algorithms for combining the output data. Both quorum-majority voting (QMV) and compare-majority voting (CMV) are most applicable to distributed real-time systems with single-source/single-sink tasks. All real-time systems eventually have to resolve their outputs into a single action at some stage. The development of the advanced information processing system (AIPS) and other similar systems serve to emphasize the importance of these techniques. Time bounds suggest that it is possible to reduce the overhead for quorum-majority voting to below that for synchronous voting. All the bounds assume that the computation phase is nonpreemptive and that there is no multitasking", url = "http://ieeexplore.ieee.org/iel1/24/926/00024574.pdf", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Birman87, author = "Birman, K.P. ; Joseph, T.A.", title = "Reliable communication in the presence of failures", journal = "ACM Transactions on Computer Systems 5,", year = "1987", pages = "47-76", number = "1", abstract = "The design and correctness of a communication facility for a distributed computer system are reported on. The facility provides support for fault-tolerant process groups in the form of a family of reliable multicast protocols that can be used in both local- and wide-area networks. These protocols attain high levels of concurrency, while respecting application-specific delivery ordering constraints, and have varying cost and performance that depend on the degree of ordering desired. In particular, a protocol that enforces causal delivery orderings is introduced and shown to be a valuable alternative to conventional asynchronous communication protocols. The facility also ensures that the processes belonging to a fault-tolerant process group will observe consistent orderings of events affecting the group as a whole, including process failures, recoveries, migration, and dynamic changes to group properties like member rankings. A review of several uses for the protocols in the ISIS system, which supports fault-tolerant resilient objects and bulletin boards, illustrates the significant simplification of higher level algorithms made possible by our approach", url = "http://doi.acm.org/10.1145/7351.7478", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Chandra96, author = "Chandra, T.D. ; Toueg, S.", title = "Unreliable failure detectors for reliable distributed systems", journal = "Journal of the ACM 43,", year = "1996", pages = "225-67", number = "2", abstract = "We introduce the concept of unreliable failure detectors and study how they can be used to solve Consensus in asynchronous systems with crash failures. We characterise unreliable failure detectors in terms of two properties-completeness and accuracy. We show that Consensus can be solved even with unreliable failure detectors that make an infinite number of mistakes, and determine which ones can be used to solve consensus despite any number of crashes, and which ones require a majority of correct processes. We prove that Consensus and Atomic Broadcast are reducible to each other in asynchronous systems with crash failures; thus, the above results also apply to Atomic Broadcast. A companion paper shows that one of the failure detectors introduced here is the weakest failure detector for solving Consensus [Chandra et al., 1992]", url = "http://doi.acm.org/10.1145/226643.226647", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @Conference{Cristian85, author = "Cristian, F. ; Aghili, H. ; Strong, R. ; Dolev, D. ", title = "Atomic broadcast: from simple message diffusion to Byzantine agreement", inbook = "Fifteenth Annual International Symposium on Fault-Tolerant Computing FTCS 15. Digest of Papers. ", year = "1985", pages = "200-6", abstract = "A new family of atomic broadcast protocols is presented that ranges from a very simple protocol that only tolerates omission faults to a rather sophisticated protocol that tolerates any type of fault. The more types of faults a protocol can tolerate, the more complex and expensive it is to build and run. The objective is to provide, for all considered fault classes, protocols that are cost-effective for those classes in terms of message traffic, speed, and complexity. The protocols work for arbitrary point-to-point network topologies, can tolerate any number of faults up to network partitioning, use a small number of messages, and achieve in many cases the best possible termination times", url = "http://ieeexplore-beta.ieee.org//iel3/3846/11214/00532668.pdf", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", }