; Annotated reading list on group communication and fault-tolerant CORBA.
;
; Every entry carries the usual citation fields followed by a block of blank
; review fields (studentname, summary, contribution1-5, weakness1-5,
; interesting, opinions). Standard bibliography styles ignore these unknown
; fields; they are presumably filled in by the student before hand-in.
;
; NOTE: text outside an entry is skipped by BibTeX, but an at-sign always
; starts a new entry, even inside commentary like this. The hand-in address
; below is therefore spelled with "(at)" so that it does not trigger a
; parse error.
;
; Hand this in to: ece849-staff+hw (at) ece.cmu.edu

@inproceedings{ maffeis95adding,
  author        = "Silvano Maffeis",
  title         = "Adding Group Communication and Fault-Tolerance to {CORBA}",
  booktitle     = "Proceedings of the {USENIX} Conference on Object-Oriented Technologies ({COOTS})",
  pages         = "135--146",
  year          = "1995",
  abstract      = "Groupware and fault-tolerant distributed systems stimulate the
                   need for structuring activities around objectgroups and reliable
                   multicast communication. The objectgroup abstraction permits to
                   treat a collection of networkobjects as if they were a single
                   object; clients can invoke operations on object-groups without
                   needing to know the exact membership of the group. Object-groups
                   mainly serve to increase reliability through replication,
                   performance through parallelism, or to distribute data from ...",
  url           = "http://citeseer.ist.psu.edu/maffeis95adding.html",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{ felber96design,
  author        = "P. Felber and B. Garbinato and R. Guerraoui",
  title         = "The Design of a {CORBA} Group Communication Service",
  booktitle     = "Proceedings of the 15th Symposium on Reliable Distributed Systems ({SRDS}-15)",
  address       = "Niagara-on-the-Lake, Canada",
  pages         = "150--159",
  year          = "1996",
  abstract      = "The Common Object Request Broker Architecture (CORBA) is becoming
                   a middleware standard for distributed application development,
                   and there are increasing needs in enriching the basic
                   functionalities of CORBA. Whereas mechanisms for persistence,
                   transactions, event channels, etc., have been designed and
                   specified for CORBA, no support is provided to handle object
                   replication. In this paper we discuss the issue of augmenting
                   CORBA with group communication, which is considered an adequate
                   paradigm ...",
  url           = "http://citeseer.ist.psu.edu/felber96design.html",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{ narasimhan02_ftcorba_lessons,
  author        = "Narasimhan, P. and Moser, L. E. and Melliar-Smith, P. M.",
  title         = "Lessons Learned in Building a Fault-Tolerant {CORBA} system",
  booktitle     = "Proceedings of the International Conference on Dependable Systems and Networks",
  pages         = "39--44",
  year          = "2002",
  abstract      = "The Eternal system pioneered the interception approach to
                   providing transparent fault tolerance for CORBA, which allows it
                   to make a CORBA application reliable with little or no
                   modification to the application or the ORB. The design and
                   implementation of the Eternal system has influenced industrial
                   practices by providing the basis for the specifications of the
                   Fault-Tolerant CORBA standard that the Object Management Group
                   adopted. In this paper, we discuss our experience in developing
                   the Eternal system, with particular emphasis on the challenges
                   that we encountered and the lessons that we learned.",
  url           = "http://ieeexplore.ieee.org/iel5/7991/22107/01028884.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@article{ felber04_ftcorba_experiences,
  author        = "Pascal Felber and Priya Narasimhan",
  title         = "Experiences, Strategies and Challenges in Building Fault-Tolerant {CORBA} Systems",
  journal       = "IEEE Transactions on Computers",
  volume        = "53",
  number        = "5",
  pages         = "497--511",
  year          = "2004",
  abstract      = "It has been almost a decade since the earliest reliable CORBA
                   implementation and, despite the adoption of the fault-tolerant
                   CORBA (FT-CORBA) standard by the Object Management Group, CORBA
                   is still not considered the preferred platform for building
                   dependable distributed applications. Among the obstacles to
                   FT-CORBA's widespread deployment are the complexity of the new
                   standard, the lack of understanding in implementing and
                   deploying reliable CORBA applications, and the fact that current
                   FT-CORBA do not lend themselves readily to complex, real-world
                   applications. We candidly share our independent experiences as
                   developers of two distinct reliable CORBA infrastructures (OGS
                   and Eternal) and as contributors to the FT-CORBA standardization
                   process. Our objective is to reveal the intricacies, challenges,
                   and strategies in developing fault-tolerant CORBA systems,
                   including our own. Starting with an overview of the new FT-CORBA
                   standard, we discuss its limitations, along with techniques for
                   best exploiting it. We reflect on the difficulties that we have
                   encountered in building dependable CORBA systems, the solutions
                   that we developed to address these challenges, and the lessons
                   that we learned. Finally, we highlight some of the open issues,
                   such as nondeterminism and partitioning, that remain to be
                   resolved.",
  url           = "http://www.ece.cmu.edu/~mead/tocs-2004.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

; Supplemental Reading

@inproceedings{Merlin78,
  author        = "Merlin, P. M. and Randell, B.",
  title         = "State restoration in distributed systems",
  booktitle     = "FTCS-8. The Eighth Annual International Conference on Fault-Tolerant Computing",
  year          = "1978",
  pages         = "129--134",
  abstract      = "This paper concerns an important aspect of the problem of
                   designing fault-tolerant distributed computing systems. The
                   concepts involved in `backward error recovery', i.e. restoring a
                   system, or some part of a system, to a previous state which it
                   is hoped or believed preceded the occurrence of any existing
                   errors are formalised, and generalised so as to apply to
                   concurrent, e.g. distributed, systems. Since in distributed
                   systems there may exist a great deal of independence between
                   activities, the system can be restored to a state that could
                   have existed rather than to a state that actually existed. The
                   formalisation is based on the use of what is termed `Occurrence
                   Graphs' to represent the cause-effect relationships that exist
                   between the events that occur when a system is operational, and
                   to indicate existing possibilities for state restoration. A
                   protocol is presented which could be used in each of the nodes
                   in a distributed computing system in order to provide system
                   recoverability in the face even of multiple faults",
  url           = "http://ieeexplore-beta.ieee.org//iel3/3846/11214/00532636.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}