Rowan-Classes/5th-Semester-Fall-2023/Prob-and-Stat-for-ECEs/Notes/Prob-and-Stat-Notes.tex
2024-02-22 14:23:12 -05:00

852 lines
29 KiB
TeX

\documentclass{report}
\input{preamble}
\input{macros}
\input{letterfonts}
\title{\Huge{Prob and Stat for ECEs}}
\author{\huge{Aidan Sharpe}}
\date{}
\begin{document}
\maketitle
\newpage% or \cleardoublepage
% \pdfbookmark[<level>]{<title>}{<dest>}
\pdfbookmark[section]{\contentsname}{toc}
\tableofcontents
\pagebreak
\chapter{Probability Density and Cumulative Distribution Functions}
\ex{}{
Suppose there are 30 resistors, 7 of them do not work. You randomly
choose 3 of them. Let $X$ be the number of defective resistors. Find
the probability distribution of $X$.
$$X \in \{0, 1, 2, 3\}$$
$$P(X=0) = { {7 \choose 0} {23 \choose 3} \over {30 \choose 3} } = 0.436$$
$$P(X=1) = { {7 \choose 1} {23 \choose 2} \over {30 \choose 3} } = 0.436$$
$$P(X=2) = { {7 \choose 2} {23 \choose 1} \over {30 \choose 3} } = 0.119$$
$$P(X=3) = { {7 \choose 3} {23 \choose 0} \over {30 \choose 3} } = 0.009$$
Probability distribution:
$$P(X = x) =
\begin{cases}
0.436 & x=0 \\
0.436 & x=1 \\
0.119 & x=2 \\
0.009 & x=3
\end{cases}$$
}
\dfn{The Cumulative Distribution Function}
{
The cumulative distribution function (CDF), $F(x)$, of a discrete random variable, $X$, with probability distribution, $f(x)$, is:
$$F(x) = P(X \le x)$$
}
\noindent
Find CDF for the example above:
$$F(0) = P(X \le 0) = P(X = 0) = 0.436$$
$$F(1) = P(X \le 1) = P((X = 0) \cup (X=1)) = 0.872$$
$$F(2) = P(X \le 2) = P((X=0) \cup (X=1) \cup (X=2)) = 0.991$$
Since 3 is the largest possible value for $x$:
$$F(3) = P(X \le 3) = 1$$
\noindent
As a piecewise function:
$$F(x) =
\begin{cases}
0 & x < 0 \\
0.436 & 0 \le x < 1 \\
0.872 & 1 \le x < 2 \\
0.991 & 2 \le x < 3 \\
1 & x \ge 3
\end{cases}$$
\ex{}{
Suppose that a day's production of 850 manufactured parts contains 50
parts that do not conform to customer requirements. 2 parts are selected
at random from the batch. Let $X$ be the number of non-conforming
parts.
\paragraph{a)}
Find the probability distribution for $X$:
$$P(X = 0) = { {50 \choose 0} {800 \choose 2} \over {850 \choose 2 }} = 0.8857$$
$$P(X = 1) = { {50 \choose 1} {800 \choose 1} \over {850 \choose 2 }} = 0.1109$$
$$P(X = 2) = { {50 \choose 2} {800 \choose 0} \over {850 \choose 2 }} = 0.0034$$
$$P(X = x) =
\begin{cases}
0.8857 & x=0 \\
0.1109 & x=1 \\
0.0034 & x=2
\end{cases}$$
\paragraph{b)}
Find the CDF $F(x)$:
$$F(x) =
\begin{cases}
0 & x < 0 \\
0.8857 & 0 \le x < 1 \\
0.9966 & 1 \le x < 2 \\
1 & x \ge 2
\end{cases}$$
\paragraph{c)}
Plot $F(x)$:
}
\section{Continuous Probability Distributions}
A continuous random variable is a variable that can take on any value within a range. It takes on infinitely many possible values within the range.
\includegraphics{NormalDistribution.png}
\noindent
For a continuous distribution, $f(x)$:
$$P(X = x) = 0$$
$$P(x_0 \le X \le x_1) = \int\limits_{x_0}^{x_1} f(x) dx$$
$$P(X \ge x_0) = \int\limits_{x_0}^{\infty} f(x) dx$$
\dfn{}{
The function, $f(x)$, is a probability density function of the
continuous random variable, $X$, defined over $\Reals$ if:
\begin{enumerate}
\item
$$f(x) \ge 0, \forall x \in \Reals$$
\item
$$\int\limits_{-\infty}^{\infty} f(x) dx = 1$$
\item
$$P(x_0 \le X \le x_1) = P(x_0 < X < x_1)$$ $$= P(x_0 \le X < x_1)$$
$$= P(x_0 < X \le x_1)$$
\end{enumerate}
}
\ex{}{
Suppose that the error in the reaction temperature in $^\circ \text{C}$ for a controlled lab experiment is a continuous random variable, $X$, having PDF:
$$f(x) =
\begin{cases}
{x^2 \over 3} & -1 < x < 2 \\
0 & \text{elsewhere}
\end{cases}$$
\paragraph{a)} Verify that $f(x)$ is a PDF.
$$\int\limits_{-1}^{2} {x^2 \over 3} dx \stackrel{?}{=} 1$$
$${1 \over 3} \left[{1 \over 3} x^3 \Big\vert_{-1}^{2}\right] = {1\over9}[8- (-1)] = 1$$
\paragraph{b)} Find $P(0 < X < 0.5)$:
$$P(0 < X < 0.5) = \int\limits_0^{0.5} {x^2 \over 3}dx$$
$${1\over9}\left[x^3 \Big|_0^{0.5}\right] = {1\over9}[0.125] = 0.01389$$
}
\dfn{}
{
The CDF, $F(x)$ of a continuous random variable, $X$, with probability density function $f(x)$ is:
$$F(x) = P(X \le x) = \int\limits_{-\infty}^x f(t) dt$$
}
\nt{
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
$$P(a < X < b) = F(b) - F(a)$$
\item
$$f(x) = {d\over dx}F(x)$$
\end{enumerate}
}
\ex{}{
Find the CDF of the previous example $$f(x) =
\begin{cases}
{x^2 \over 3} & -1 < x < 2 \\
0 & \text{elsewhere}
\end{cases}$$
$$F(x) = \int\limits_{-1}^x {t^2 \over 3} dt$$
$${1\over 9}\left[t^3\Big|_{-1}^x\right] = {1\over 9}\left[x^3 + 1\right]$$
$$F(x) = \begin{cases}
0 & x < -1 \\
{1\over 9} \left[x^3 + 1\right] & -1 \le x \le 2 \\
1 & x > 2
\end{cases}$$
}
\ex{}{
The proportion of the budget for a certain type of industrial company that is allotted to environmental and pollution control is coming under scrutiny. A data collection project determines that the distribution of these proportions is given by: $$f(y) = \begin{cases}
k(1-y)^4 & 0 \le y \le 1 \\
0 & \text{elsewhere}
\end{cases}$$
Find $k$ that renders $f(y)$ a valid density function:
$$\int\limits_0^1 k(1-y)^4dy = 1$$ $${k\over5} = 1$$
$$\therefore k = 5$$
}
\chapter{Expected Value}
\dfn{Expected Value}
{
Let $X$ be a random variable with probability distribution $f(x)$. The mean, or expected value, of $X$ is:
For a discrete distribution $$E[X] = \sum\limits_x xf(x)$$
For a continuous distribution:
$$E[X] = \int\limits_{-\infty}^{\infty} xf(x)dx$$
}
\noindent
Given $\{1, 2, 3, 3, 5\}$, the mean is: $${1+2+3+3+5 \over 5} = 2.8$$
$$f(x) = \begin{cases}
{1\over5} & x=1 \\
{1\over5} & x=2 \\
{2\over5} & x=3 \\
{1\over5} & x=5 \\
\end{cases}$$
$$\sum\limits_x xf(x) = {1\over5}(1) + {1\over5}(2) + {2\over5}(3) + {1\over5}(5) = 2.8$$
\ex{}
{
The probability distribution of a discrete random variable $X$ is:
$$f(x) = {3 \choose x}\left({1 \over 4}\right)^x\left({3\over4}\right)^{3-x}, x \in \{0, 1, 2, 3\}$$
Find $E[X]$: $$f(x) =
\begin{cases}
0.422 & x=0 \\
0.422 & x=1 \\
0.14 & x=2 \\
{1\over64} & x=3
\end{cases}$$
$$E[X] = \sum\limits_x x {3 \choose x}\left({1\over4}\right)^x \left({3\over4}\right)^{3-x}$$
$$E[X] = 0(0)+ 0.422(1) + 0.14(2) + {1\over64}(3) = 0.75$$
}
\hypertarget{example-1}{%
\subsubsection{Example}\label{example-1}}
Let $X$ be the random variable that denotes the life in hours of a
certain electronic device. The PDF is: $$f(x) =
\begin{cases}
{20000\over x^3} & x > 100 \\
0 & \text{elsewhere}
\end{cases}$$
Find the expected life of this type of device:
$$E[X] = \int\limits_{-\infty}^{\infty} xf(x)dx = \int\limits_{100}^{\infty}x{20000 \over x^3}dx = 200 \text{[hrs]}$$
\textbf{Note:} $$E[X^2] = \int\limits_{-\infty}^{\infty}x^2f(x)dx$$
\hypertarget{properties-of-expectations}{%
\subsubsection{Properties of
Expectations}\label{properties-of-expectations}}
$$E(b) = b$$ Where $b$ is a constant $$E(aX) = aE[X]$$ Where $a$ is
a constant $$E(aX + b) = aE[X] + b$$ $$E[X + Y] = E[X] + E[Y]$$ Where
$X$ and $Y$ are random variables
\hypertarget{example-2}{%
\subsubsection{Example}\label{example-2}}
Given: $$f(x) = \begin{cases}
{x^2\over3} & -1 < x < 2 \\
0 & \text{elsewhere}
\end{cases}$$ Find the expected value of $Y = 4X + 3$:
$$E[Y] = E[4X + 3] = 4E[X] + 3$$
$$E[X] = \int\limits_{-1}^{2} {x^3 \over 3}dx = {1\over12}x^4 \Big|_{-1}^{2}={5\over4}$$
$$E[Y] = 4\left({5\over4}\right) + 3 = 8$$
\hypertarget{variance-of-a-random-variable}{%
\subsubsection{Variance of a Random
Variable}\label{variance-of-a-random-variable}}
The expected value/mean is of special importance because it describes
where the probability distribution is centered. However, we also need to
characterize the variance of the distribution.
\hypertarget{definition-1}{%
\subsubsection{Definition}\label{definition-1}}
Let $X$ be a random variable with probability distribution, $f(x)$,
and mean, $\mu$. The variance of $X$ is given by:
$$\text{Var}[X] = E[(X-\mu)^2]$$ Which is the average squared distance
away from the mean. This simplifies to:
$$\text{Var}[X] = E[X^2] - E[X]^2$$ \textbf{Note:} Generally,
$$E[X^2] \ne E[X]^2$$
The standard deviation, $\sigma$, is given by:
$$\sigma = \sqrt{\text{Var}[X]}$$
\textbf{Note}: The variance is a measure of uncertainty (spread) in the
data.
\hypertarget{example-3}{%
\subsubsection{Example}\label{example-3}}
The weekly demand for a drinking water product in thousands of liters
from a local chain of efficiency stores is a continuous random variable,
$X$, having the probability density: $$f(x) = \begin{cases}
2(x-1) & 1 < x < 2 \\
0 & \text{elsewhere}
\end{cases}$$
Find the expected value:
$$E[X] = \int\limits_1^2 x (2(x-1)) dx = 2\int\limits_1^2 (x^2 - x)dx$$
$$E[X] = 2\left[{1\over3}x^3 - {1\over2}x^2 \Big|_1^2 \right] = {5\over3}$$
Find the variance: $$\text{Var}[X] = E[X^2] - E[X]^2$$
$$E[X^2] = \int\limits_1^2 2x^2(x-1)dx = 2\int\limits_1^2 (x^3 - x^2)dx$$
$$E[X^2] = {17\over6}$$
$$\text{Var}[X] = {17\over6} - \left({5\over3}\right)^2 = {1\over18}$$
Find the standard deviation:
$$\sigma = \sqrt{\text{Var}[X]} = {1\over3\sqrt{2}} = {\sqrt{2}\over6}$$
\hypertarget{example-4}{%
\subsubsection{Example}\label{example-4}}
The mean and variance are useful when comparing two or more
distributions.
\begin{longtable}[]{@{}lll@{}}
\toprule()
& Plan 1 & Plan 2 \\
\midrule()
\endhead
Avg Score Improvement & $+17$ & $+15$ \\
Standard deviation & $\pm8$ & $\pm2$ \\
\bottomrule()
\end{longtable}
\hypertarget{theorem}{%
\subsubsection{Theorem}\label{theorem}}
If $X$ has variance, $\text{Var}[X]$, then
$\text{Var}[aX + b] = a^2\text{Var}[X]$.
\hypertarget{example-5}{%
\subsubsection{Example}\label{example-5}}
The length of time, in minutes, for an airplane to obtain clearance at a
certain airport is a random variable, $Y = 3X - 2$, where $X$ has
the density: $$f(x) = \begin{cases}
{1\over4} e^{-x/4} & x > 0 \\
0 & \text{elsewhere}
\end{cases}$$
$$E[X] = 4$$ $$\text{Var}[X] = 16$$
Find $E[Y]$: $$E[Y] = E[3X-2] = 3E[X] - 2 = 10$$
$$\text{Var}[Y] = 3^2\text{Var}[X] = 144$$
$$\sigma = \sqrt{\text{Var}[Y]} = 12$$
\hypertarget{the-exponential-distribution}{%
\subsection{The Exponential
Distribution}\label{the-exponential-distribution}}
The continuous random variable, $X$, has an exponential distribution
with parameter $\beta$ if its density function is given by:
$$f(x) = \begin{cases}
{1\over\beta}e^{-x/\beta} & x > 0 \\
0 & \text{elsewhere}
\end{cases}$$
Where $\beta > 0$.
$$E[X] = \beta$$
$$E[X] = \int\limits_0^{\infty} x{1\over\beta}e^{-x/\beta} dx$$
Aside: $$\Gamma(Z) = \int\limits_0^\infty x^{Z - 1}e^{-x}dx$$ Where
$\Gamma(Z) = (Z - 1)!$
$$E[X] = \beta \int\limits_0^\infty \left({x\over\beta}\right)^{(2-1)} e^{-x/\beta} \left({dx\over\beta}\right) = \beta\Gamma(2)$$
$$E[X] = \beta(2-1)! = \beta$$
$$\text{Var}[X] = E[X^2] - E[X]^2$$
$$E[X^2] = \int\limits_0^\infty x^2{1\over\beta}e^{-x/\beta}dx = \beta^2 \int\limits_0^\infty \left({x\over\beta}\right)^{(2-1)} e^{-x/\beta} \left({dx\over\beta}\right)$$
$$E[X^2] = \beta^2\Gamma(3) = 2\beta^2$$
$$\text{Var}[X] = 2\beta^2 - \beta^2 = \beta^2$$
\hypertarget{application}{%
\paragraph{Application}\label{application}}
Reliability analysis: the time to failure of a certain electronic
component can be modeled by an exponential distribution.
\hypertarget{example-6}{%
\subsubsection{Example}\label{example-6}}
Let $T$ be the random variable which measures the time to failure of a
certain electronic component. Suppose $T$ has an exponential
distribution with $\beta = 5$.
$$f(x) = \begin{cases}
{1\over5}e^{-x/5} & x > 0 \\
0 & \text{elsewhere}
\end{cases}$$
If 6 of these components are in use, what is the probability that
exactly 3 components are still functioning at the end of 8 years?
What is the probability that an individual component is still
functioning after 8 years?
$$P(T > 8) = \int\limits_8^\infty {1\over5}e^{-x/5}dx \approx 0.2$$
$${6 \choose 3}(0.2)^3(0.8)^3 = 0.08192$$
\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{\textgreater{}\textgreater{}\textgreater{}} \ImportTok{from}\NormalTok{ math }\ImportTok{import}\NormalTok{ comb}
\OperatorTok{\textgreater{}\textgreater{}\textgreater{}}\NormalTok{ comb(}\DecValTok{6}\NormalTok{,}\DecValTok{3}\NormalTok{) }\OperatorTok{*} \FloatTok{0.2}\OperatorTok{**}\DecValTok{3} \OperatorTok{*} \FloatTok{0.8}\OperatorTok{**}\DecValTok{3}
\FloatTok{0.08192000000000003}
\end{Highlighting}
\end{Shaded}
\hypertarget{the-normal-distribution}{%
\subsection{The Normal Distribution}\label{the-normal-distribution}}
The most important continuous probability distribution in the field of
statistics is the normal distribution. It is characterized by 2
parameters, the mean, $\mu$, and the variance, $\sigma^2$.
$$\text{mean} = \text{median} = \text{mode}$$
$$f(x \mid \mu,\sigma^2) = {1 \over \sqrt{2\pi}\,\sigma} e^{-{1 \over 2\sigma^2}(x-\mu)^2}$$
$$E[X] = \mu$$ $$\text{Var}[X] = \sigma^2$$
For a normal curve:
$$P(x_1 < x < x_2) = \int\limits_{x_1}^{x_2} f(x)dx$$
\hypertarget{definition-2}{%
\subsubsection{Definition}\label{definition-2}}
The distribution of a normal variable with mean 0 and variance 1 is
called a standard normal distribution.
The transformation of any random variable, $X$ into a standard normal
variable, $Z$: $$Z = {X - \mu \over \sigma}$$
\hypertarget{example-7}{%
\subsubsection{Example}\label{example-7}}
Given a normal distribution with mean $\mu = 30$ and standard
deviation, $\sigma = 6$, find the normal curve area to the right of
$x = 17$.
Transform to standard normal. $$Z = {17 - 30 \over 6} = -2.16$$
That is, $x = 17$ on a normal distribution with $\mu = 30$ and
$\sigma = 6$ is equivalent to $Z=-2.16$ on a normal distribution
with $\mu = 0$ and $\sigma = 1$.
$$P(X > 17) = P(Z > -2.16)$$
$$P(Z > -2.16) = 1 -P(Z \le -2.16) = 0.9846$$
\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{\textgreater{}\textgreater{}\textgreater{}} \ImportTok{from}\NormalTok{ scipy.stats }\ImportTok{import}\NormalTok{ norm}
\OperatorTok{\textgreater{}\textgreater{}\textgreater{}}\NormalTok{ norm.cdf(}\OperatorTok{{-}}\FloatTok{2.16}\NormalTok{)}
\FloatTok{0.015386334783925445}
\end{Highlighting}
\end{Shaded}
\hypertarget{example-8}{%
\subsubsection{Example}\label{example-8}}
The finished inside diameter of a piston ring is normally distributed
with mean, $\mu = 10${[}cm{]}, and standard deviation,
$\sigma = 0.03${[}cm{]}.
What is the probability that a piston ring will have inside diameter
between 9.97{[}cm{]} and 10.03{[}cm{]}?
$$Z_1 = {9.97 - 10 \over 0.03} = -1$$ $$Z_2 = {10.03 - 10 \over 0.03} = 1$$
$$P(9.97 < x < 10.03) = 0.68$$
\begin{Shaded}
\begin{Highlighting}[]
\OperatorTok{\textgreater{}\textgreater{}\textgreater{}} \ImportTok{from}\NormalTok{ scipy.stats }\ImportTok{import}\NormalTok{ norm}
\OperatorTok{\textgreater{}\textgreater{}\textgreater{}}\NormalTok{ norm.cdf(}\DecValTok{1}\NormalTok{) }\OperatorTok{{-}}\NormalTok{ norm.cdf(}\OperatorTok{{-}}\DecValTok{1}\NormalTok{)}
\FloatTok{0.6826894921370859}
\end{Highlighting}
\end{Shaded}
\chapter{Hypothesis Testing}
There is a 1 to 1 relationship between the test of a hypothesis about any parameter, say, $\theta$, and the confidence interval for $\theta$.
\dfn{}
{
If (LB, UB) is a 100(1-$\alpha$)\% confidence interval for the parameter, $\theta$, the test of size $\alpha$ of the hypothesis:
\begin{itemize}
\item[$H_0$:] $\theta = \theta_0$
\item[$H_a$:] $\theta \ne \theta_0$
\end{itemize}
will lead to a rejection of $H_0$ if and only if $\theta_0$ is not in the 100(1-$\alpha$)\% confidence interval.
}
\ex{}{
Consider homework 6, problem 1. We had $\bar{x} = 664$ and $s = 500$. Test the hypothesis:
\begin{itemize}
\item[$H_0$:] $\mu = 634$
\item[$H_a$:] $\mu \ne 634$
\item[$\alpha$:] 0.05
\end{itemize}
The 95\% confidence interval for $\mu$:
$$\bar{x} \pm t^* {s\over\sqrt{n}}$$
$$t^* = 1.961$$
$$664 \pm 1.961 {500 \over \sqrt{1700}}$$
$$(640.22, 687.78)$$
Since the value indicated by $H_0$ (634) is not within the 95\% confidence interval, it is not a plausible value for $\mu$, and thus we reject $H_0$ at $\alpha = 0.05$.
\\
\\
Sample evidence suggests that the average number of social ties for a cell phone user is significantly different from 634.
}
\subsection{Confidence Interval for the Difference of Two Means}
If $\bar{x}_1$ and $\bar{x}_2$ are the means of independent random samples of size $n_1$ and $n_2$ from approximately normal populations with unknown but equal variances, a 100(1-$\alpha$)\% confidence interval for $\mu_1 - \mu_2$ is given by:
$$(\bar{x}_1 - \bar{x}_2) \pm t_{\alpha \over 2} s_p \sqrt{{1\over n_1} + {1\over n_2}}$$
$$s_p^2 = {(n_1-1)s_1^2 + (n_2-1)s_2^2 \over n_1 + n_2 -2}$$
Where:
\begin{itemize}
\item[$t_{\alpha \over 2}$] is the t-value with $n_1 + n_2 - 2$ degrees of freedom
\end{itemize}
\ex{}
{
Homework 7, problem 3: burn time for fuses.
\begin{center}
\begin{tabular}{ c | c }
Supplier A & Supplier B \\
\hline
$n_1 = 30$ & $n_2 = 30$ \\
$\bar{x}_1 = 30.62$ & $\bar{x}_2 = 31.37$ \\
$s_1^2 = 0.384$ & $s_2^2 = 0.185$
\end{tabular}
\end{center}
Does the sample suggest that the mean burn time for supplier A is different than that for supplier B? Use $\alpha = 0.05$.
\begin{itemize}
\item[$H_0$:] $\mu_1 = \mu_2$
\item[$H_a$:] $\mu_1 \ne \mu_2$
\end{itemize}
The 95\% confidence interval for $\mu_1 - \mu_2$:
$$(\bar{x}_1 - \bar{x}_2) \pm t_{\alpha \over 2} s_p \sqrt{{1\over n_1} + {1\over n_2}}$$
$$s_p^2 = {(30-1)0.384 + (30-1)0.185 \over 30 + 30 - 2} = 0.2845$$
$$\therefore s_p = 0.5334$$
$$t_{\alpha \over 2} = 2.002$$
$$(30.62 - 31.37) \pm 2.002(0.5334)\sqrt{{1\over30} + {1\over30}}$$
$$(-1.02, -0.47)$$
We are 95\% confident that the difference of mean burn time between supplier A and supplier B is somewhere between -1.02 and -0.47.
\\
\\
Since 0 is not in the confidence interval, we reject $H_0$. Sample evidence suggests that the mean burn time for supplier A is different than that for supplier B.
}
\nt{
Since the entire confidence interval is below zero, we conclude with 95\% confidence that $\mu_1 - \mu_2 < 0$ and by extension, $\mu_1 < \mu_2$.
}
\\
\\
\noindent
The usefulness of using confidence intervals for significance testing:
\begin{itemize}
\item A confidence interval provides information about the magnitude and direction of the difference between $\mu_1$ and $\mu_2$.
\item However, a hypothesis test does not provide such information. It only provides information about significance.
\end{itemize}
\subsection{Paired t-test}
Comparing two treatments where observations occur in pairs or are related to each other.
\ex{}
{
Two teaching methods to be compared by using 50 students divided into two equal classes.
\\
\\
Method 1:
Randomly assign 25 students to each class and compare average scores when experiment is concluded.
\\
\\
What if one group gets better students? It would no longer be a fair comparison of the two methods.
In this case, there would be two sources of variation:
\begin{enumerate}
\item Due to teaching method
\item Due to differences between students
\end{enumerate}
This inflates the variance and leads to lower power.
\\
\\
Possible solution:
Pair students according to preference/ability. This would have mainly variance due to teaching method.
}
\ex{}
{
10 adult males between the ages of 35 and 50 participated in a study to evaluate the effect of diet and exercise on blood cholesterol levels.
The total cholesterol was measured in each subject initially and three months after.
\begin{center}
\begin{tabular}{c | c | c | c}
Subject & Before & After & Difference\\
\hline
1 & 265 & 229 & 36 \\
2 & 240 & 231 & 9 \\
3 & 258 & 227 & 31 \\
4 & 295 & 240 & 55 \\
5 & 251 & 238 & 13 \\
6 & 245 & 241 & 4 \\
7 & 287 & 234 & 53 \\
8 & 314 & 256 & 58 \\
9 & 260 & 247 & 13\\
10 & 279 & 239 & 40
\end{tabular}
\end{center}
Run a one-sample t-test on the differences:
\begin{itemize}
\item[$H_0$:] $\mu_d = 0$
\item[$H_a$:] $\mu_d \ne 0$
\end{itemize}
Test statistic:
$$t^* = {\bar{d} - 0 \over {s_d \over \sqrt{n}}}$$
$$\bar{d} = {1\over n} \sum_{i=1}^n d_i$$
$$s_d^2 = {1 \over n-1} \sum_{i=1}^n(d_i - \bar{d})^2$$
Do the data support the claim that the diet and exercise are of value in production of a mean reduction in blood cholesterol levels using $\alpha = 0.05$?
$$t^* = {31.2 \over {20.43 \over \sqrt{10}}} = 4.829$$
The associated p-value is very close to 0. Since the p-value is less than $\alpha$, we reject $H_0$. Sample evidence strongly suggests that diet and exercise are of value in producing an effect in blood cholesterol levels.
}
\nt
{
With the pairing approach, we roughly halve our degrees of freedom compared to the two sample approach ($n - 1$ instead of $2n - 2$).
\\
\\
However, if paired observations are highly similar or related, the reduction in variance more than compensates for the loss of degrees of freedom.
}
\section{Analysis of Variance}
Consider the problem of deciding whether observed differences among more than two sample means can be attributed to chance, or whether there are real differences among the populations being sampled.
\begin{itemize}
\item[$H_0$:] $\mu_1 = \mu_2 = \cdots = \mu_t$
\item[$H_a$:] at least one mean differs.
\end{itemize}
\ex{}
{
Consider the following observations:
\begin{center}
\begin{tabular}{c | c | c}
Group 1 & Group 2 & Group 3 \\
\hline
77 & 72 & 76 \\
81 & 58 & 85 \\
71 & 84 & 82 \\
76 & 66 & 80 \\
80 & 70 & 88
\end{tabular}
\end{center}
Suppose our data can be written as:
$$Y_{i j} = \mu_i + \veps_{i j}$$
Where:
\begin{itemize}
\item[$\veps_{i j}$] deviation from the group means
\item[$\mu_i$] $i^\text{th}$ group mean
\item[$Y_{i j}$] $j^\text{th}$ observation from $i^\text{th}$ group.
\end{itemize}
To infer if at least one $\mu_i$ differs from the others, compare the variance within the groups against the variance between the groups.
}
\noindent
The sum of the observations in the $i^\text{th}$ group:
$$y_{i \cdot} = \sum_{j=1}^r y_{i j}$$
The overall sum:
$$y_{\cdot \cdot} = \sum_{i=1}^t \sum_{j=1}^r y_{i j}$$
The mean of observations in the $i^\text{th}$ group:
$$\bar{y}_{i \cdot} = {1\over r} \sum_{j = 1}^r y_{i j}$$
The overall mean:
$$\bar{y}_{\cdot \cdot} = {1 \over r t} \sum_{i=1}^t \sum_{j=1}^r y_{i j}$$
\noindent
Decompose the observation:
$$y_{i j} - \bar{y}_{\cdot \cdot} = y_{i j} + \bar{y}_{i \cdot} - \bar{y}_{i \cdot} - \bar{y}_{\cdot \cdot} = (\bar{y}_{i \cdot} - \bar{y}_{\cdot \cdot}) + (y_{i j} - \bar{y}_{i \cdot})$$
The deviation of an observation from the grand mean is the same as the sum of the deviation of the treatment mean from the grand mean and the deviation of the observation from its treatment mean.
\thm{}
{
$$\sum_{i = 1}^t \sum_{j = 1}^r (y_{i j} - \bar{y}_{\cdot \cdot})^2 = r\sum_{i = 1}^t (\bar{y}_{i \cdot} - \bar{y}_{\cdot \cdot})^2 + \sum_{i = 1}^t \sum_{j = 1}^r (y_{i j} - \bar{y}_{i \cdot})^2$$
The variability of observations about the grand mean is the sum of the variability of treatment means about the grand mean and the variability of observations about their treatment means.
SSTotal = SSTreatment + SSError
}
\noindent
The Anova Table:
\begin{center}
\begin{tabular}{c | c | c | c}
Source of Variation & Sum of Squares & Degrees of Freedom & Mean Squares \\
\hline
Treatment (between groups) & SSTreat & t - 1 & ${\text{SSTreat} \over t -1}$ \\
Error (within groups) & SSE & $t(r - 1)$ & ${\text{SSE} \over t(r-1)}$ \\
Total & SSTotal & $tr - 1$
\end{tabular}
\end{center}
\nt
{
\begin{enumerate}
\item dfTotal = dfTreatment + dfError
\item $\sigma^2$ is estimated by MSE = ${\text{SSE} \over t(r -1)}$ which is a pooled estimate of $\sigma^2$ from all the data in the experiment.
\end{enumerate}
}
\subsection{Tests of Hypotheses}
\begin{itemize}
\item[$H_0$:] $\mu_1 = \mu_2 = \cdots = \mu_t$
\item[$H_a$:] at least one mean differs
\end{itemize}
\noindent
Assumptions:
\begin{enumerate}
\item $E[\veps_{i j}] = 0$
\item Var$[\veps_{i j}] = \sigma^2$
\item Cov$[\veps_{i j}, \veps_{i' j'}] = 0$; $i \ne i'$, $j \ne j'$
\item $\veps_{i j}$ has a normal distribution
\end{enumerate}
\noindent
Test statistic:
$$F = {\text{MSTreat} \over \text{MSE}} \sim F_{t-1, t(r-1)}$$
If $H_0$ is true:
$$F = {\text{MSTreat} \over \text{MSE}} \approx 1$$
If $H_a$ is true, $F_\text{obs} > 1$ and increases as treatment differences increase.
P-value:
$$P(F_{t-1, t(r-1)} \ge F_\text{obs})$$
\ex{}
{
Three types of traffic signals were utilized in a study to investigate traffic delay:
\begin{enumerate}
\item pretimed
\item semi-actuated
\item fully actuated
\end{enumerate}
Five intersections were used for each type of signal. The measure of traffic delay used in the study was the average stopped time per vehicle at each intersection. The data are given by:
\begin{center}
\begin{tabular}{c | c | c}
Pretimed & Semi-Actuated & Fully Actuated \\
\hline
36.6 & 17.5 & 15.0 \\
39.2 & 20.6 & 10.4 \\
30.4 & 18.7 & 18.9 \\
37.1 & 25.7 & 10.5 \\
34.1 & 22.0 & 15.2
\end{tabular}
\end{center}
Compute the analysis of variance:
$$\bar{y}_{\cdot \cdot} = {1 \over 15}(36.6 + 39.2 + \cdots + 15.2) = 23.46$$
$$\bar{y}_{1 \cdot} = {1 \over 5}(36.6 + 39.2 + 30.4 + 37.1 + 34.1) = 35.48$$
$$\bar{y}_{2 \cdot} = 20.9$$
$$\bar{y}_{3 \cdot} = 14$$
$$\text{SSTotal} = \sum_{i = 1}^3 \sum_{j=1}^5 (y_{i j} - \bar{y}_{\cdot \cdot})^2 = 1340.456$$
$$\text{SSTreat} = 5\sum_{i=1}^3 (\bar{y}_{i \cdot} - \bar{y}_{\cdot \cdot})^2 = 1202.626$$
$$\text{SSE} = \text{SSTotal} - \text{SSTreat} = 137.83$$
\begin{center}
\begin{tabular}{c | c | c | c}
Source of Variation & Sum of Squares & Degrees of Freedom & Mean of Squares \\
\hline
Treatments & 1202.626 & 3 - 1 = 2 & 601.313 \\
Error & 137.83 & 3(5-1) = 12 & 11.4858 \\
Total & 1340.456 & 14
\end{tabular}
\end{center}
$$F_\text{obs} = {\text{MSTreat} \over \text{MSE}} = {601.313 \over 11.48} = 52.35$$
For such a high observed $F$, the p-value is very close to zero. Sample evidence suggests that the mean delays of the three types of traffic signals differ.
}
\section{Non-Parametric Testing}
Most of the hypothesis testing and confidence interval procedures have been based on the assumption that the samples are random from normally distributed populations. These are called parametric methods. Non-parametric or distribution-free methods make no assumptions about the distribution of the underlying population.
\subsection{The Signed Rank Test}
The only assumption is that the data is continuous and comes from a symmetric distribution.
\begin{enumerate}
\item Compute the differences $X_i - \mu_0, i=1\cdots n$.
\item Rank the absolute differences $|X_i - \mu_0|$ in ascending order
\item Compute $w^+$, the sum of the ranks of the positive differences, and $w^-$, the sum of the ranks of the negative differences
\item The test statistic is given by:
$$w^\text{observed} = \min(w^-, w^+)$$
\item Use lookup table and reject $H_0$ if $w^\text{observed} \le w_\alpha^*$.
\end{enumerate}
\\
\\
\noindent
For a one-sided test, if the alternative hypothesis is $\mu > \mu_0$, then $w^\text{observed} = w^-$. If the alternative hypothesis is $\mu < \mu_0$, then $w^\text{observed} = w^+$.
\ex{}
{
A report on a study in which a rocket motor is formed by binding an igniter propellant and a sustainer propellant together inside a metal housing. The shear strength of the bond between the two types of propellant types is an important characteristic. The results of testing 10 randomly selected motors are shown below. Do the data suggest that the mean shear strength is different from 2000 psi using $\alpha = 0.05$.
\begin{center}
\begin{tabular}{ c | c | c | c }
Observation & $X_i$ & $X_i - \mu_0$ & Rank \\
\hline
1 & 2158.7 & 158.7 & 2 \\
2 & 1678.15 & -321.85 & 8 \\
3 & 2316.00 & 316.00 & 7 \\
4 & 2016.00 & 16.00 & 1 \\
5 & 2207.5 & 207.5 & 3 \\
6 & 1708.3 & -291.70 & 6 \\
7 & 1784.7 & -215.3 & 4 \\
8 & 2575.10 & 575.10 & 10 \\
9 & 2357.9 & 357.90 & 9 \\
10 & 2256.7 & 256.7 & 5\\
\end{tabular}
\end{center}
For this two-sided test:
\\
$H_0$: $\mu = 2000$
\\
$H_a$: $\mu \ne 2000$
The sum of the positive ranks:
$$w^+ = 2 + 7 + 1 + 3 + 10 + 9 + 5 = 37$$
The sum of the negative ranks:
$$w^- = 8 + 6 + 4 = 18$$
$$w^\text{observed} = \min(w^-, w^+) = 18$$
\\
The critical value, $w^*_\alpha$, is found in a lookup table. In this case it is 8. Since the observed test statistic is greater than this critical value, we fail to reject $H_0$. Sample evidence does not suggest that the mean shear strength is different from 2000 psi.
}
\\
\\
\noindent
Where does $w^*_\alpha$ come from?
\end{document}