@article{mitra05_soft_error_resilience,
author = "Mitra, S. and Seifert, N. and Zhang, M. and Shi, Q. and Kim, K. S.",
title = "Robust system design with built-in soft-error resilience",
journal = "IEEE Computer",
year = "2005",
pages = "43--52",
number = "2",
volume = "38",
abstract = "Transient errors caused by terrestrial radiation pose a major
barrier to robust system design. A system's susceptibility to such
errors increases in advanced technologies, making the incorporation
of effective protection mechanisms into chip designs essential. A
new design paradigm reuses design-for-testability and debug resources
to eliminate such errors.",
url = "http://ieeexplore.ieee.org/iel5/2/30429/01401773.pdf",
}
@article{reibman91_reliability_modeling,
author = "A. Reibman and M. Veeraraghavan",
affiliation = "Bell Labs",
title = "Reliability modeling: an overview for system
designers",
journal = "IEEE Computer",
organization = "Bell Labs",
year = "1991",
volume = "24",
number = "4",
pages = "49--57",
abstract = "The role of reliability models in system design is
examined. Methods for predicting system reliability
are discussed, covering the choice of metric for
analysis, creating the system reliability model, and
refining the model. A case study is presented to
illustrate reliability modeling.",
url = "http://ieeexplore.ieee.org/iel1/2/2541/00076262.pdf",
}
; Chapter with its own authors inside an edited volume, so it is stored as
; incollection: title is the chapter, booktitle is the book.
@incollection{dugan95_dependability,
author = "Joanne Bechta Dugan and Michael R. Lyu",
affiliation = "Bell Labs",
title = "Dependability Modeling for Fault-Tolerant Software and Systems",
booktitle = "Software Fault Tolerance",
editor = "Lyu, Michael R.",
publisher = "John Wiley \& Sons",
organization = "University of Virginia, VA, USA",
year = "1995",
chapter = "5",
pages = "109--138",
abstract = " Three major fault-tolerant software system
architectures, distributed recovery blocks,
N-version programming, and N self-checking
programming, are modeled by a combination of fault
tree techniques and Markov processes. In these three
architectures, transient and permanent hardware
faults as well as unrelated and related software
faults are modeled in the system-level domain. The
model parameter values are determined from the
analysis of data collected from a fault-tolerant
avionic application. Quantitative analyses for
reliability and safety factors achieved in these
three fault-tolerant system architectures are
presented.",
url =
"http://www.ece.cmu.edu/~ece849/papers/dugan95_depend_modeling.pdf",
}
@article{schlichting83_failstop,
author = "Richard D. Schlichting and Fred B. Schneider",
title = "Fail-stop processors: an approach to designing
fault-tolerant computing systems",
journal = "ACM Transactions on Computer Systems",
volume = "1",
number = "3",
pages = "222--238",
year = "1983",
abstract = "A methodology that facilitates the design of
fault-tolerant computing systems is presented. It is
based on the notion of a failstop processor. Such a
processor automatically halts in response to any
internal failure and does so before the effects of
that failure become visible. The problem of
implementing processors that, with high probability,
behave like fail-stop processors is
addressed. Axiomatic program verification techniques
are described for use in developing provably correct
programs for failstop processors. The design of a
process control system illustrates the use of our
methodology.",
url =
"http://citeseer.ist.psu.edu/schlichting83failstop.html",
}
; Supplemental Readings
@inproceedings{bossen81_edfi,
author = "D. Bossen and M. Hsiao",
affiliation = "IBM, USA",
title = "{ED/FI}: A Technique for Improving Computer System
{RAS}",
booktitle = "Fault-Tolerant Computing 1995, Highlights from
Twenty-Five Years",
organization = "FTCS",
year = "1995",
abstract = "ED/FI (error detection and fault isolation) is a
model for projecting the ability of a computer
system to dynamically detect hardware errors during
normal operation, and to automatically isolate the
fault causing the error based only on information
captured at the time the error is detected. This
general approach to fault isolation solves the
difficult problem of intermittent fault diagnosis
based on testing. This model has been used to
project the error detection and fault isolation
characteristics of a number of products, and
experimental results show good correlation with the
model's projections.",
url =
"http://ieeexplore.ieee.org/iel3/3846/11214/00532644.pdf",
opinions = "generally low ratings in 2005",
}
@inproceedings{bouricius71_reliability,
author = "Bouricius, W. and Carter, W. and Jessep, D. and Schneider, P. and
Wadia, A.",
affiliation = "IBM, USA",
title = "Reliability modeling for fault tolerant computers",
booktitle = "Fault-Tolerant Computing 1995, Highlights from
Twenty-Five Years",
organization = "FTCS",
year = "1995",
abstract = "Reliability modeling and the mathematical equations
involved are discussed for general computer systems
organized to be fault-tolerant. This paper
summarizes the work done over the last four years on
mathematical reliability modeling by the authors.",
url =
"http://ieeexplore.ieee.org/iel3/3846/11214/00532626.pdf",
}
@article{Sahner87,
author = "Sahner, R. A. and Trivedi, K. S.",
title = "Reliability modeling using {SHARPE}",
journal = "IEEE Transactions on Reliability",
volume = "R-36",
year = "1987",
pages = "186--193",
number = "2",
abstract = "The authors present an approach for avoiding the
large state-space problem. The approach uses a
hierarchical modeling technique for analyzing
complex reliability models. It allows the
flexibility of Markov models where necessary and
retains the efficiency of combinatorial solution
where possible. Based on this approach, a computer
program called SHARPE (symbolic hierarchical
automated reliability and performance evaluator) has
been written. The hierarchical modeling technique
provides a very flexible mechanism for using
decomposition and aggregation to model large
systems; it allows for both combinatorial and Markov
or semi-Markov submodels, and can analyze each model
to produce a distribution function. The choice of
the number of levels of models and the model types
at each level is left up to the modeler. Component
distribution functions can be any exponential
polynomial whose range is between zero and
one. Examples show how combinations of models can be
used to evaluate the reliability and availability of
large systems using SHARPE",
url =
"http://www.ece.cmu.edu/~ece749/papers/sahner87_sharpe.pdf",
}
@article{Barbara87,
author = "Barbara, D. and Garcia-Molina, H.",
title = "The reliability of voting mechanisms",
journal = "IEEE Transactions on Computers",
volume = "C-36",
year = "1987",
pages = "1197--1208",
number = "10",
abstract = "In a faulty distributed system, voting is commonly
used to achieve mutual exclusion among groups of
isolated nodes. Each node is assigned a number of
votes, and any group with a majority of votes can
perform the critical operations. The problem of
selecting vote assignments in order to maximize the
probability that the critical operations can be
performed at a given time by some group of nodes is
addressed. Simple heuristics to assign votes are
suggested, and it is shown that they give good
results in most cases. Three particular homogeneous
topologies (fully connected, Ethernet, and ring
networks) are studied, and analytical expressions
for system reliability are derived that provide
useful insights into the reliability provided by
voting mechanisms",
url =
"http://www.ece.cmu.edu/~ece749/papers/barbara87_voting_reliability.pdf",
}
@inproceedings{Malhis95,
author = "Malhis, L. M. and Sanders, W. H. and Schlichting, R. D.",
title = "Numerical evaluation of a group-oriented multicast
protocol using stochastic activity networks",
booktitle = "Proceedings of the Sixth International Workshop on
Petri Nets and Performance Models",
year = "1995",
pages = "63--72",
abstract = "Group-oriented multicast protocols that provide
message ordering and delivery guarantees are
becoming increasingly important in distributed
system design. However, despite the large number of
such protocols, little analytical work has been done
concerning their performance, especially in the
presence of message loss. This paper illustrates a
method for determining the performability of
group-oriented multicast protocols using stochastic
activity networks, a stochastic extension to Petri
nets, and reduced base model construction. In
particular, we study the performability of one such
protocol, called Psync, under a wide variety of
workload and message loss probabilities. The
specific focus is on measuring two quantities, the
stabilization time-that is, the time required for
messages to arrive at all hosts-and channel
utilization. The analysis shows that Psync works
well when message transmissions are frequent, but
exhibits extremely long message stabilization times
when transmissions are infrequent and message losses
occur. The results provide useful insight on the
behavior of Psync, as well as serve as a guide for
evaluating the performability of other
group-oriented multicast protocols",
url =
"http://ieeexplore.ieee.org/iel3/4022/11544/00524316.pdf",
}
@article{Rai87,
author = "Rai, S. and Sarje, A. K. and Prasad, E. V. and Kumar, A.",
title = "Two recursive algorithms for computing the
reliability of k-out-of-n systems",
journal = "IEEE Transactions on Reliability",
volume = "R-36",
year = "1987",
pages = "261--265",
number = "2",
abstract = "The authors present two recursive methods to compute
reliability of a k-out-of-n system. The method is
simple and computationally efficient when compared
with other current methods. Examples illustrate the
technique. The algorithms are presented in a
recursive language with an Algol-like notation. The
algorithms are easy to remember and can be used for
manual computations",
url =
"http://www.ece.cmu.edu/~ece749/papers/rai87_k_of_n_reliability.pdf",
}
@article{Abraham74,
author = "Abraham, J. A. and Siewiorek, D. P.",
title = "An algorithm for the accurate reliability evaluation
of triple modular redundancy networks",
journal = "IEEE Transactions on Computers",
volume = "C-23",
year = "1974",
pages = "682--692",
number = "7",
abstract = "There are several instances where the classical
method of triple-modular redundancy (TMR)
reliability modeling may provide predictions which
are inadequate. It is shown that for even simple
networks such as those exhibiting fan in and fan
out, classical methods may predict a reliability
that is higher than or lower than the actual
reliability. Furthermore, the classical method gives
no hint as to whether the predicted number is high
or low. As a solution to this problem, a method of
partitioning an arbitrary network into cells such
that faults in a cell are independent of faults in
other cells is proposed. An algorithm is then given
to calculate the reliability of any such cell, by
considering only the structure of the
interconnections within the cells. the value of the
reliability found is exact if TMR is assumed to be a
coherent system. An approximation to the algorithm
is also described; this can be used to find a lower",
url =
"http://www.ece.cmu.edu/~ece749/papers/abraham74_tmr_evaluation.pdf",
}
@article{Geist90,
author = "Geist, R. and Trivedi, K. S.",
title = "Reliability estimation of fault-tolerant systems:
tools and techniques",
journal = "IEEE Computer",
volume = "23",
year = "1990",
pages = "52--61",
number = "7",
abstract = "A comparative evaluation of state-of-the-art tools
and techniques for estimating the reliability of
fault-tolerant computing systems is presented. The
theory of reliability estimation is briefly
reviewed. Five current approaches are compared in
detail: HARP (hybrid automated reliability
predictor), SURE (semi-Markov unreliability range
estimator), HEIRESS (hierarchical estimation of
interval reliability by skewed sampling), SHARPE
(symbolic hierarchical automated reliability and
performance evaluator), and SAVE (system
availability estimator). Particular attention is
given to design limitations imposed by underlying
model assumptions, on the one hand, and the
efficiency and accuracy of the solution techniques
employed, on the other hand",
url =
"http://ieeexplore.ieee.org/iel1/2/2058/00056852.pdf",
}
@article{Cullyer89,
author = "Cullyer, W. J.",
title = "Implementing high integrity systems: the {VIPER}
microprocessor",
journal = "IEEE Aerospace and Electronic Systems Magazine",
volume = "4",
year = "1989",
pages = "5--13",
number = "6",
abstract = "The author describes the development of VIPER and
points out some of the practical problems
encountered over the four years of the
project. Informal proofs of correctness, carried out
in the early stages of the project, are outlined. A
peer review group criticized the lack of
multiplication and division instructions in VIPER
1. This deficiency is corrected in VIPER 2, and the
performance is increased to 3 MIPs",
url =
"http://ieeexplore.ieee.org/iel2/761/492/00009638.pdf",
}