## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
In the example above the variables are country
,
continent
, year
, lifeExp
,
pop
, and gdpPercap
.
library(gapminder)
head(gapminder)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
In the dataset above, the continuous variables are
lifeExp
, pop
, and gdpPercap
. The
year
variable could be considered
discrete.
In the dataset above, the qualitative variables are
country
and continent
. These are examples of
nominal variables.
- Relative frequency histograms are more useful when comparing histograms from samples of different sample sizes. (서로 다른 n을 가진 샘플들 간의 비교 시에 더 용이함)
hist(gapminder$lifeExp, main = "Frequency Histogram", xlab = "Life Expectancy (years)")
hist(gapminder$lifeExp, main = "Relative Frequency Histogram", freq = FALSE, xlab = "Life Expectancy (years)")
- Using
freq = FALSE
option, a frequency histogram can be converted into a relative frequency histogram easily.
- The only difference between these two histograms is the y-axis (and the title). The shape and bins remain the same.
- The proportions of the observations in each bin adds to 1 in a relative frequency histogram.
- Note that the number of bins in the histogram controls the amount of infomration loss: the larger the bin width the more information we lose
- Too many bins do not allow us to identify interesting patterns within the data because they get lost in the noise.
breaks()
option to change this.R
runs its algorithm to break up the data but it
gives you generally what you want.par(mfrow = c(2, 2))
hist(gapminder$lifeExp, breaks = 5, main = "Breaks = 5", xlab = "Life Expectancy (yrs")
hist(gapminder$lifeExp, breaks = 10, main = "Breaks = 10", xlab = "Life Expectancy (yrs)")
hist(gapminder$lifeExp, breaks = 30, main = "Break = 30", xlab = "Life Expectancy (yrs)")
hist(gapminder$lifeExp, breaks = 100, main = "Breaks = 100", xlab = "Life Expectancy (yrs)")
Question: whether there is an optimal number of bins.
R’s default is equi-spaced breaks in which the area of each bin is proportional to the number of points in each bin.
\[ k=log_2n+1 \]
\[ f_{n}(x)=\frac{1}{n h} \sum_{i=1}^{n} K\left(\frac{x-x_{i}}{h}\right) \]
- \(h\): bandwidth
- \(K(.)\): Kernel function – Usually standard Gaussian density is used as kernel
\[ K(x)=\frac{1}{\sqrt{2 \pi}} e^{-x^{2} / 2} \]
x <- sort(c(19, 20, 10, 17, 16, 13, 16, 10, 7, 18))
kde <- density(x, kernel = "gaussian", bw = 2)
A.kernel <- lapply(x, function(i) {
density(i, kernel = "gaussian", bw = 2)
})
for (i in seq_along(A.kernel)) {
A.kernel[[i]][["y"]] <- A.kernel[[i]][["y"]]/length(x)
}
plot(x, rep(0, length(x)), col = "red", pch = 17, xlim = c(min(kde$x), max(kde$x)),
ylim = c(0, max(kde$y)), ylab = "Density", main = "Construction of Kernel Density Estimation")
plot(x, rep(0, length(x)), col = "red", pch = 17, xlim = c(min(kde$x), max(kde$x)),
ylim = c(0, max(kde$y)), ylab = "Density", main = "Construction of Kernel Density Estimation")
lines(A.kernel[[1]], col = "red")
plot(x, rep(0, length(x)), col = "red", pch = 17, xlim = c(min(kde$x), max(kde$x)),
ylim = c(0, max(kde$y)), ylab = "Density", main = "Construction of Kernel Density Estimation")
for (j in c(1, 3)) {
lines(A.kernel[[j]], col = "red")
}
plot(x, rep(0, length(x)), col = "red", pch = 17, xlim = c(min(kde$x), max(kde$x)),
ylim = c(0, max(kde$y)), ylab = "Density", main = "Construction of Kernel Density Estimation")
for (j in seq_along(A.kernel)) {
lines(A.kernel[[j]], col = "red")
}
plot(x, rep(0, length(x)), col = "red", pch = 17, xlim = c(min(kde$x), max(kde$x)),
ylim = c(0, max(kde$y)), ylab = "Density", main = "Construction of Kernel Density Estimation")
for (j in seq_along(A.kernel)) {
lines(A.kernel[[j]], col = "red")
}
lines(kde)
hist(gapminder$lifeExp, probability = TRUE, main = "Histogram with a Kernal Density",
xlab = "Life Expectancy (yrs)")
lines(density(gapminder$lifeExp), col = 2, lwd = 2)
There is a bias-variance trade-off in the choice of the bandwidth:
- the larger the bandwidth, the smoother the density (low variance, high bias)
- the smaller the bandwidth, the less smooth the density (high variance, low bias)
plot(density(gapminder$lifeExp, bw = 10), ylim = c(0, 0.05), main = "Density with bandwidth of 10",
lwd = 2, xlab = "Life Expectancy (yrs)")
b <- c(5, 3, 1, 0.5) #this is a vector for the different bandwidths used later
plot(density(gapminder$lifeExp, bw = 10), ylim = c(0, 0.05), main = "Densities with bandwidth of 10 and 5",
lwd = 2, xlab = "Life Expectancy (yrs)")
lines(density(gapminder$lifeExp, bw = b[1]), col = 2, lwd = 2)
plot(density(gapminder$lifeExp, bw = 10), ylim = c(0, 0.05), main = "Densities with bandwidths = 10, 5, 3",
lwd = 2, xlab = "Life Expectancy (yrs)")
for (i in 1:2) {
lines(density(gapminder$lifeExp, bw = b[i]), col = i + 1, lwd = 2)
}
plot(density(gapminder$lifeExp, bw = 10), ylim = c(0, 0.05), main = "Densities with bandwidths = 10, 5, 3, 1",
lwd = 2, xlab = "Life Expectancy (yrs)")
for (i in 1:3) {
lines(density(gapminder$lifeExp, bw = b[i]), col = i + 1, lwd = 2)
}
plot(density(gapminder$lifeExp, bw = 10), ylim = c(0, 0.05), main = "Density with Different Bandwidths",
lwd = 2, xlab = "Life Expectancy (yrs)")
for (i in seq_along(b)) {
lines(density(gapminder$lifeExp, bw = b[i]), col = i + 1, lwd = 2)
}
legend("topleft", paste0("bw=", c(10, b)), fill = 1:5)
x3 <- data.frame(X1 = rnorm(10000, 0, 1), X2 = rnorm(10000, 5, 1), X3 = rnorm(10000,
0, 2), X4 = rchisq(10000, 2) - 2, X5 = rt(10000, df = 3))
plot(density(x3[, 1]), lwd = 2, xlim = c(-8, 17), xlab = "", main = "Kernel density estimates for five distributions",
sub = "")
for (j in (2:NCOL(x3))) lines(density(x3[, j]), col = j, lwd = 2)
Can we describe the differences between these five distributions using numerical summaries?
1> Location
2> Scale or Variability
3> Shape
\[ \bar{x}=\frac{\sum_{i=1}^{n} x_{i}}{n} \]
x <- 1:10
mean(x)
## [1] 5.5
median(x)
## [1] 5.5
x <- c(1:9, 100)
mean(x)
## [1] 14.5
median(x)
## [1] 5.5
plot(density(x3[, 1]), lwd = 2, xlim = c(-8, 17), xlab = "", main = "Kernel density estimates for five distributions",
sub = "")
for (j in (2:NCOL(x3))) lines(density(x3[, j]), col = j, lwd = 2)
legend("topright", paste0("Mean: ", round(colMeans(x3))), fill = 1:NCOL(x3))
Standard Deviation (SD): measures the spread, variability, or scale of a distribution.
It is defined as:
\[ s_{x}=\sqrt{\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)^{2}}{n-1}} \]
plot(density(x3[, 1]), lwd = 2, xlim = c(-8, 17), xlab = "", main = "Kernel density estimates for five distributions",
sub = "")
for (j in (2:NCOL(x3))) lines(density(x3[, j]), col = j, lwd = 2)
legend("topright", paste0("Mean: ", round(colMeans(x3)), "; SD: ", round(apply(x3,
2, sd))), fill = 1:NCOL(x3))
\[ \begin{array}{l}\bar{y}=a \bar{x}+b \\ \text { and } \\ s_{y}=|a| s_{x}\end{array} \]
\[ z=\frac{x-\bar{x}}{s_{x}} \]
\[ M A D_{x}=\operatorname{median}\left(\left|x_{i}-M_{x}\right|\right) \]
where \(M_x\) is the median of \(X=\left\{x_{1}, \ldots, x_{n}\right\}\)
Note that quantile vs. quartile.
Quartiles | Quantiles |
---|---|
First/lower quartile | 0.25 quantile |
Second quartile/median | 0.50 quantile |
Third/upper quartile | 0.75 quantile |
\[ \operatorname{median} \equiv\left\{\begin{array}{ll}x_{m+1}, & \text { if } n=2 m+1, \text { for some } m \geq 0 \\ \frac{1}{2}\left(x_{m}+x_{m+1}\right), & \text { if } n=2 m, \text { for some } m>0\end{array}\right. \]
x <- c(rnorm(100), c(-4, 5))
boxplot(x)
Comparing the distances between the quartiles and the median gives indication of the symmetry of the distribution.
boxplot(x3, col = 1:NCOL(x3), main = "Boxplots for Five Distributions")
x <- rnorm(100)
y <- rgamma(100, shape = 0.5)
z <- c(rnorm(50), rnorm(50, mean = 5))
par(mfrow = c(2, 3))
boxplot(x, main = "Normally distributed")
boxplot(y, main = "Skewed")
boxplot(z, main = "Bimodal")
hist(x, probability = TRUE, main = "")
lines(density(x), col = 2, lwd = 2)
hist(y, probability = TRUE, main = "")
lines(density(y), col = 2, lwd = 2)
hist(z, probability = TRUE, main = "")
lines(density(z), col = 2, lwd = 2)
par(mfrow = c(2, 3))
for (i in seq_len(NCOL(x3))) {
qqnorm(x3[, i], pch = 20, col = i)
abline(0, 1)
}
gapminder %>%
ggplot(aes(x = log(gdpPercap), y = lifeExp)) + geom_point() + theme_classic() +
ggtitle("Relation between life expectancy and log of GDP per capita (all years)")
gapminder %>%
ggplot(aes(x = log(gdpPercap), y = lifeExp)) + geom_density2d() + theme_classic() +
ggtitle("Relation between life expectancy and log of GDP per capita (all years)")
\[ r=\operatorname{Cor}(x, y)=\frac{1}{n} \sum_{i=1}^{n}\left(\frac{x_{i}-\bar{x}}{s_{x}}\right)\left(\frac{y_{i}-\bar{y}}{s_{y}}\right) \]
suppressPackageStartupMessages(library(MASS))
## Warning: package 'MASS' was built under R version 4.0.5
par(mfrow = c(2, 2))
x <- mvrnorm(100, c(0, 0), matrix(c(1, 0, 0, 1), 2, 2, byrow = TRUE))
plot(x, xlab = "x", ylab = "y", main = paste("Cor[x,y] = ", round(cor(x[, 1], x[,
2]), 2), sep = ""))
x <- mvrnorm(100, c(0, 0), matrix(c(1, -0.75, -0.75, 1), 2, 2, byrow = TRUE))
plot(x, xlab = "x", ylab = "y", main = paste("Cor[x,y] = ", round(cor(x[, 1], x[,
2]), 2), sep = ""))
x <- mvrnorm(100, c(0, 0), matrix(c(1, 0.75, 0.75, 1), 2, 2, byrow = TRUE))
plot(x, xlab = "x", ylab = "y", main = paste("Cor[x,y] = ", round(cor(x[, 1], x[,
2]), 2), sep = ""))
x <- mvrnorm(100, c(0, 0), matrix(c(1, 0.99, 0.99, 1), 2, 2, byrow = TRUE))
plot(x, xlab = "x", ylab = "y", main = paste("Cor[x,y] = ", round(cor(x[, 1], x[,
2]), 2), sep = ""))
par(mfrow = c(2, 2))
x <- mvrnorm(100, c(0, 0), matrix(c(1, 0, 0, 1), 2, 2, byrow = TRUE))
boxplot(data.frame(x), main = paste("Cor[x1,x2] = ", round(cor(x[, 1], x[, 2]), 2),
sep = ""))
x <- mvrnorm(100, c(0, 0), matrix(c(1, -0.75, -0.75, 1), 2, 2, byrow = TRUE))
boxplot(data.frame(x), main = paste("Cor[x1,x2] = ", round(cor(x[, 1], x[, 2]), 2),
sep = ""))
x <- mvrnorm(100, c(0, 0), matrix(c(1, 0.75, 0.75, 1), 2, 2, byrow = TRUE))
boxplot(data.frame(x), main = paste("Cor[x1,x2] = ", round(cor(x[, 1], x[, 2]), 2),
sep = ""))
x <- mvrnorm(100, c(0, 0), matrix(c(1, 0.99, 0.99, 1), 2, 2, byrow = TRUE))
boxplot(data.frame(x), main = paste("Cor[x1,x2] = ", round(cor(x[, 1], x[, 2]), 2),
sep = ""))
x <- rnorm(500)
y <- -x^2 + rnorm(500, 0, 0.5)
plot(x, y, main = paste("Cor[x,y] = ", round(cor(x, y), 2), sep = ""))
abline(lm(y ~ x)[[1]], col = 2, lwd = 2)
lines(lowess(y ~ x, f = 0.2), col = 3, lwd = 2)
legend("bottom", c("Ordinary linear regression", "Robust local regression"), col = 2:3,
lwd = 2)
- 만약 두 변수 x, y간 non-linear relationship이 있는 경우, x, y의 상관계수를 측정했을 때 0이 나올 수 있음
- 이는 x, y가 관계를 가지고 있는 것이 분명해도, 그 관계가 ’linear’이 아니가 때문임
- 상관계수는 두 변수의 “linear relationship”만을 알려줌. 따라서, x, y 간의 relationship이 있더라도 그 관계가 선형이 아니라면, 상관계수로 두 변수 x, y의 relationship을 제대로 측정할 수 없음.
lifeExp
,
pop
, and gdpPercap
in the gapminder data, we
can use the R
function cor()
.cor(gapminder[, 4:6])
## lifeExp pop gdpPercap
## lifeExp 1.00000000 0.06495537 0.58370622
## pop 0.06495537 1.00000000 -0.02559958
## gdpPercap 0.58370622 -0.02559958 1.00000000
library(ellipse)
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
##
## pairs
plotcorr(r)
# Install bioconductor packages
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install("golubEsets")
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
BiocManager::install("hu6800.db")
suppressPackageStartupMessages(library(golubEsets))
## Warning: package 'BiocGenerics' was built under R version 4.0.5
suppressPackageStartupMessages(library(hu6800.db))
data(Golub_Train)
golub <- exprs(Golub_Train)
golub[golub < 100] <- 100
golub[golub > 16000] <- 16000
golub <- log2(golub)
AffyID <- mappedkeys(hu6800SYMBOL)
symbols <- links(hu6800SYMBOL[AffyID])
indZYX <- grep("zyx", symbols[, 2], ignore.case = TRUE)
indNUP <- grep("NUP98", symbols[, 2], ignore.case = TRUE)
symbols5 <- links(hu6800SYMBOL[c(symbols[c(indZYX[1], indNUP[1]), 1], c("AB004884_at",
"A28102_at", "AC000064_cds1_at"))])
golub5 <- t(golub[symbols5[, 1], ])
colnames(golub5) <- symbols5[, 2]
boxplot(golub5, ylab = "Gene expression")
library(ggridges)
library(tidyr)
library(ggplot2)
golub_tall <- gather(data.frame(golub5), key = "Gene", value = "Expression")
ggplot(golub_tall, aes(x = Expression, y = Gene, group = Gene, fill = Gene)) + geom_density_ridges() +
theme_classic() + scale_fill_brewer(palette = "Set3")
## Picking joint bandwidth of 0.392
library(pheatmap)
pheatmap(t(golub5))