---
title: "EDA wykłady"
output:
html_document: default
---
##PCA
###Results of the olympic heptathlon competition, Seoul, 1988.
A scoring system is used to assign points to the results from each event and the winner is the woman who accumulates the most points over the two days. The event made its first Olympic appearance in 1984.
hurdles = 100 m płotki
shot = pchnięcie kulą
javelin = oszczep
```{r}
load("siedmioboj.Rda")
hm <- as.matrix(siedmioboj[,1:7])
hm[1:5,]
kor <- round(cor(hm), 2)
kor
```
```{r, PCA}
siedmioboj_pca <- prcomp(hm, scale = TRUE)
lambdy <- siedmioboj_pca$sdev^2
(round(siedmioboj_pca$rotation,4))
round(lambdy,4)
sum(lambdy)
round(cumsum(lambdy)/7,2)
```
```{r,wyniki}
center <- siedmioboj_pca$center
scale <- siedmioboj_pca$scale
drop(scale(hm, center = center, scale = scale) %*%
siedmioboj_pca$rotation[,1])
#predict
predict(siedmioboj_pca)[,1]
(punkty.pca <- predict(siedmioboj_pca)[,1])
wykres <- data.frame(pca=punkty.pca,score=siedmioboj$score)
```
```{r,echo=FALSE}
require(ggplot2)
siedm.lin <- lm(siedmioboj$score~punkty.pca)
siedm.sm <- summary(siedm.lin)
r2 <- siedm.sm$r.squared
b <- siedm.lin$coefficients[1]
a <- siedm.lin$coefficients[2]
eq <- paste0("score = ", round(a,2), "*pca + ", round(b,2)," r^2 = ",round(r2,3))
sp <- ggplot(data=wykres, aes(x=pca, y=score)) + geom_point()
print(sp + geom_abline(intercept = b, slope =a, color="green")+
ggtitle(eq))
```
```{r}
pc2 <- as.data.frame(predict(siedmioboj_pca)[,1:2])
ggplot(data = pc2, aes(x = PC1, y = PC2, label = rownames(pc2))) + geom_hline(yintercept = 0, colour = "gray65") + geom_vline(xintercept = 0, colour = "gray65") + geom_text(colour = "tomato", alpha = 0.8, size = 3) + geom_point()
```
```{r,kolo korelacyjne}
require(FactoMineR)
pca.FM <- PCA(hm, graph = FALSE)
round(pca.FM$var$coord,4)
```
```{r , wykres koła korelacyjnego,echo=FALSE}
# function to create a circle
circle <- function(center = c(0, 0), npoints = 100) {
r = 1
tt = seq(0, 2 * pi, length = npoints)
xx = center[1] + r * cos(tt)
yy = center[1] + r * sin(tt)
return(data.frame(x = xx, y = yy))
}
corcir = circle(c(0, 0), npoints = 100)
# create data frame with correlations between variables and PCs
correlations = as.data.frame(pca.FM$var$coord)
# data frame with arrows coordinates
arrows = data.frame(x1 = rep(0,nrow(correlations)), y1 = rep(0,nrow(correlations)), x2 = correlations$Dim.1,
y2 = correlations$Dim.2)
# geom_path will do open circles
ggplot() + geom_path(data = corcir, aes(x = x, y = y), colour = "gray65") +
geom_segment(data = arrows, aes(x = x1, y = y1, xend = x2, yend = y2), colour = "gray65") +
geom_text(data = correlations, aes(x = Dim.1, y = Dim.2, label = rownames(correlations))) +
geom_hline(yintercept = 0, colour = "gray65") + geom_vline(xintercept = 0,
colour = "gray65") + xlim(-1.1, 1.1) + ylim(-1.1, 1.1) + labs(x = "pc1",
y = "pc2") + ggtitle("Koło korelacyjne") + coord_fixed()
```
##Regresja
```{r}
hm.df <- as.data.frame(hm)
model <- lm(highjump~longjump+hurdles,data=hm.df)
summary(model)
model.1 <- lm(highjump~longjump,data=hm.df)
summary(model.1)
```
```{r}
predict(model,newdata = hm.df[1:3,])
model$fitted.values[1:3]
hm.df[1:3,2]
```
```{r,echo=FALSE}
ggplot(data = data.frame(prognozy=model$fitted.values,wartosci=hm.df$highjump), aes(x =wartosci , y = prognozy, label = rownames(hm.df))) + geom_abline(intercept = 0, slope=1,colour = "gray65") + geom_text(colour = "tomato", alpha = 0.8, size = 3) + geom_point()
```
##Korelacja Kanoniczna
https://core.ac.uk/download/pdf/6303071.pdf
```{r}
biegi <- hm[,c(1,4,7)]
sila <- hm[,c(2,3,5,6)]
biegi
sila
```
```{r,echo=FALSE, include=FALSE}
require(CCA)
```
```{r}
correl <- matcor(biegi, sila)
round(correl$Xcor,4)
round(correl$Ycor,4)
round(correl$XYcor,4)
img.matcor(correl, type = 2)
```
```{r}
siedmioboj.cc <- cc(biegi,sila)
siedmioboj.cc
```
```{r,echo=FALSE}
ggplot(data = data.frame(biegi=-siedmioboj.cc$scores$xscores[,1],sila=-siedmioboj.cc$scores$yscores[,1]), aes(x =biegi , y = sila, label = rownames(hm.df))) + geom_abline(intercept = 0, slope=1,colour = "gray65") + geom_text(colour = "tomato", alpha = 0.8, size = 3) + geom_point()
```
```{r}
barplot(siedmioboj.cc$cor , xlab = "wymiar", ylab = "Korelacje kanonicznes", names.arg = 1:3, ylim = c(0,1))
plt.cc(siedmioboj.cc,type="v",var.label=TRUE)
```
##Grupowanie (analiza skupień, *cluster analysis*)
```{r,SrodekCiezkosci,echo=FALSE}
SrodekCiezkosci <- function(dane){
g <- apply(dane,MARGIN = 2,mean)
return(list(g = g,n = nrow(dane)))
}
```
```{r,przykład dane,echo=FALSE}
x <- c(1,2,3,3,4,5,1,2,3,4,3,2)
x <- array(x,dim =c(6,2))
SrodekCiezkosci(x)
x.df <- as.data.frame(x)
x.df
x.df[7, ] <- SrodekCiezkosci(x)$g
x.df[,3] <- c(rep("dane",6),"środek ciężkości")
names(x.df)[3] <- "typ"
require(ggplot2)
ggplot(data=x.df,aes(x=V1,y=V2))+geom_point(aes(colour=typ),size=3)
```
```{r,bezwładności,echo=FALSE}
dane <- c(1,2,3,3,4,5,1,2,3,4,3,2)
dane <- array(x,dim =c(6,2))
srodki_ciezkosci <- function(dane, podzial){
require(dplyr)
k <- max(podzial)
nc <- ncol(dane)
gg <- numeric(k*(nc+1))
gg <- array(gg,dim=c(k,nc+1))
gg <- as.data.frame(gg)
for (i in 1:k) {
gg_rob<- dane %>%
as.data.frame() %>%
filter(podzial == i) %>%
SrodekCiezkosci()
gg[i,1:nc] <- gg_rob$g
gg[i,(nc+1)] <- gg_rob$n
}
return(gg)
}
JM <- function(dane, podzial){
gg <- srodki_ciezkosci(dane, podzial)
g <- SrodekCiezkosci(dane)$g
jm <- 0
for (j in 1: (ncol(gg)-1))
jm <- jm + gg$V3 %*%(gg[,j]-g[j])^2
jm <- as.numeric(jm/nrow(dane))
return(jm)
}
podzial1 <- c(2,2,2,2,1,1)
podzial2 <- c(1,2,1,2,2,1)
podzial3 <- c(1,1,2,2,2,2)
dane1 <- cbind(as.data.frame(dane),podzial=as.factor(podzial1))
ggplot(data=dane1,aes(x=V1,y=V2))+geom_point(aes(colour=podzial),size=3) +
labs(title="Podział 1")
srodki_ciezkosci(dane, podzial1)
JM(dane, podzial1)
dane2 <- cbind(as.data.frame(dane),podzial=as.factor(podzial2))
ggplot(data=dane2,aes(x=V1,y=V2))+geom_point(aes(colour=podzial),size=3) +
labs(title="Podział 2")
srodki_ciezkosci(dane, podzial2)
JM(dane, podzial2)
dane3 <- cbind(as.data.frame(dane),podzial=as.factor(podzial3))
ggplot(data=dane3,aes(x=V1,y=V2))+geom_point(aes(colour=podzial),size=3) +
labs(title="Podział 3")
srodki_ciezkosci(dane, podzial3)
JM(dane, podzial3)
```
###Komórki Woronoja
```{r}
woronoj <- function(x,y) {
require(deldir)
require(ggplot2)
df <- data.frame(x,y)
#granice Woronoja
voronoi <- deldir(x, y)
#Wykres
gr <- ggplot(data=df, aes(x=x,y=y))
gr_wyn <- gr+geom_point (size=2,color="red",alpha=0.5)+
#Plot the voronoi lines
geom_segment(
aes(x = x1, y = y1, xend = x2, yend = y2),
size = 2,
data = voronoi$dirsgs,
linetype = 1,
color= "#FFB958")
return(gr_wyn)
}
set.seed(105)
long<-rnorm(20,-98,15)
lat<-rnorm(20,39,10)
df <- data.frame(lat,long)
woronoj(df$long[c(1,3,4)],df$lat[c(1,3,4)])
woronoj(df$long,df$lat)
woronojPunkty <- function(dane,centra){
require(deldir)
require(ggplot2)
voronoi <- deldir(centra[,1], centra[,2])
gr <- ggplot(data=dane, aes(x=dane[,1],y=dane[,2]))
gr_wyn <- gr+geom_point ()+
geom_point(data=centra,aes(x=centra[,1],y=centra[,2]),size=4,color="orange",alpha=0.4)+
#Plot the voronoi lines
geom_segment(
aes(x = x1, y = y1, xend = x2, yend = y2),
size = 2,
data = voronoi$dirsgs,
linetype = 1,
color= "#FFB958")
return(gr_wyn)
}
dane <- c(1,2,3,3,4,5,1,2,3,4,3,2)
dane <- array(dane,dim =c(6,2))
dane <- as.data.frame(dane)
centra <- data.frame(x=c(2,4),y=c(3,3.5))
centra <- as.data.frame(rbind(c(2,3),c(4,3.5)))
woronojPunkty(dane,centra)
centra <- as.data.frame(rbind(c(2,3),c(4,3.5),c(3.5,2)))
woronojPunkty(dane,centra)
```
```{r}
dminCentrum <- function(punkt,centra){
k <- nrow(centra)
k_min <- -1
d_min <- Inf
for (i in 1:k) {
d <- dist(rbind(punkt, centra[i,]))
if (d Badano zmiany zawartej w plazmie krwi stężenia glukozy [%] (zmienna 1) i wolnego kwasu tłuszczowego [mEq/l] u 12 schizofreników (grupa 1) i 13 zdrowych ochotników (grupa 2) po domięśniowym wstrzyknięciu insuliny.
**Dane**
Środki ciężkości
```{r,echo=FALSE}
g1 <- c(-25.6,-0.06)
g2 <- c(-31.1,-0.15)
g1
g2
```
Macierze kowariancji
```{r,echo=FALSE}
V1 <- matrix(c(278.083,0.8291,0.8291,0.0092),nrow=2,ncol=2)
V2 <- matrix(c(269.923,-0.2493,-0.2493,0.0067),nrow=2,ncol=2)
V1
V2
```
Obliczenia
```{r}
p1 <- 12/25
p2 <- 13/25
g <- p1*g1+p2*g2
g
VW <- p1*V1+p2*V2
VW
```
```{r}
VM <- p1*p2*(g1-g2)%*%t(g1-g2)
VM
```
```{r}
V <- VW+VM
V
Vinv <- solve(V)
Vinv
u0 <- Vinv %*% (g1-g2)
u0
```
Punkt podziału
```{r}
c <- 0.5 * sum(t(u0) * (g1+g2))
c
```
Reguła 100
```{r}
ug <- 100*u0/c
ug
```
Dyskryminacja
```{r}
x <- c(-30,-0.1)
x
sum(t(ug) * x)
```
Chory
```{r}
x <- c(-20,-0.2)
x
sum(t(ug) * x)
```
Zdrowy
###Analiza Korespondencji
```{r}
library(ca)
```
```{r}
t <- matrix(c(1,1,1,1,1,0),nrow=2,ncol=3)
t <- as.table(t)
rownames(t) <- c("K","M")
colnames(t) <- c("W","P","A")
t
```
```{r}
haireye <- margin.table(HairEyeColor, 1:2)
print(haireye)
```
```{r}
cat("wiersze","\n")
t.ca <- ca(t)
t.ca$rowmass
t.ca$rowcoord
cat("\n","kolumny","\n")
t.ca$colmass
t.ca$colcoord
```
```{r}
haireye.ca <- ca(haireye)
cat("wiersze","\n")
haireye.ca$rowmass
haireye.ca$rowcoord
cat("\n","kolumny","\n")
haireye.ca$colmass
haireye.ca$colcoord
plot(haireye.ca)
# some plot options
plot(haireye.ca, lines=TRUE)
plot(haireye.ca, arrows=c(TRUE, FALSE))
```