--- title: "EDA wykłady" output: html_document: default --- ##PCA ###Results of the olympic heptathlon competition, Seoul, 1988. A scoring system is used to assign points to the results from each event and the winner is the woman who accumulates the most points over the two days. The event made its first Olympic appearance in 1984. hurdles = 100 m płotki shot = pchnięcie kulą javelin = oszczep ```{r} load("siedmioboj.Rda") hm <- as.matrix(siedmioboj[,1:7]) hm[1:5,] kor <- round(cor(hm), 2) kor ``` ```{r, PCA} siedmioboj_pca <- prcomp(hm, scale = TRUE) lambdy <- siedmioboj_pca$sdev^2 (round(siedmioboj_pca$rotation,4)) round(lambdy,4) sum(lambdy) round(cumsum(lambdy)/7,2) ``` ```{r,wyniki} center <- siedmioboj_pca$center scale <- siedmioboj_pca$scale drop(scale(hm, center = center, scale = scale) %*% siedmioboj_pca$rotation[,1]) #predict predict(siedmioboj_pca)[,1] (punkty.pca <- predict(siedmioboj_pca)[,1]) wykres <- data.frame(pca=punkty.pca,score=siedmioboj$score) ``` ```{r,echo=FALSE} require(ggplot2) siedm.lin <- lm(siedmioboj$score~punkty.pca) siedm.sm <- summary(siedm.lin) r2 <- siedm.sm$r.squared b <- siedm.lin$coefficients[1] a <- siedm.lin$coefficients[2] eq <- paste0("score = ", round(a,2), "*pca + ", round(b,2)," r^2 = ",round(r2,3)) sp <- ggplot(data=wykres, aes(x=pca, y=score)) + geom_point() print(sp + geom_abline(intercept = b, slope =a, color="green")+ ggtitle(eq)) ``` ```{r} pc2 <- as.data.frame(predict(siedmioboj_pca)[,1:2]) ggplot(data = pc2, aes(x = PC1, y = PC2, label = rownames(pc2))) + geom_hline(yintercept = 0, colour = "gray65") + geom_vline(xintercept = 0, colour = "gray65") + geom_text(colour = "tomato", alpha = 0.8, size = 3) + geom_point() ``` ```{r,kolo korelacyjne} require(FactoMineR) pca.FM <- PCA(hm, graph = FALSE) round(pca.FM$var$coord,4) ``` ```{r , wykres koła korelacyjnego,echo=FALSE} # function to create a circle circle <- function(center = c(0, 0), npoints = 100) { r = 1 tt = seq(0, 2 * pi, length = npoints) xx = center[1] + r * cos(tt) yy = center[1] + r * sin(tt) return(data.frame(x = xx, y = yy)) } corcir = circle(c(0, 0), npoints = 100) # create data frame with correlations between variables and PCs correlations = as.data.frame(pca.FM$var$coord) # data frame with arrows coordinates arrows = data.frame(x1 = rep(0,nrow(correlations)), y1 = rep(0,nrow(correlations)), x2 = correlations$Dim.1, y2 = correlations$Dim.2) # geom_path will do open circles ggplot() + geom_path(data = corcir, aes(x = x, y = y), colour = "gray65") + geom_segment(data = arrows, aes(x = x1, y = y1, xend = x2, yend = y2), colour = "gray65") + geom_text(data = correlations, aes(x = Dim.1, y = Dim.2, label = rownames(correlations))) + geom_hline(yintercept = 0, colour = "gray65") + geom_vline(xintercept = 0, colour = "gray65") + xlim(-1.1, 1.1) + ylim(-1.1, 1.1) + labs(x = "pc1", y = "pc2") + ggtitle("Koło korelacyjne") + coord_fixed() ``` ##Regresja ```{r} hm.df <- as.data.frame(hm) model <- lm(highjump~longjump+hurdles,data=hm.df) summary(model) model.1 <- lm(highjump~longjump,data=hm.df) summary(model.1) ``` ```{r} predict(model,newdata = hm.df[1:3,]) model$fitted.values[1:3] hm.df[1:3,2] ``` ```{r,echo=FALSE} ggplot(data = data.frame(prognozy=model$fitted.values,wartosci=hm.df$highjump), aes(x =wartosci , y = prognozy, label = rownames(hm.df))) + geom_abline(intercept = 0, slope=1,colour = "gray65") + geom_text(colour = "tomato", alpha = 0.8, size = 3) + geom_point() ``` ##Korelacja Kanoniczna https://core.ac.uk/download/pdf/6303071.pdf ```{r} biegi <- hm[,c(1,4,7)] sila <- hm[,c(2,3,5,6)] biegi sila ``` ```{r,echo=FALSE, include=FALSE} require(CCA) ``` ```{r} correl <- matcor(biegi, sila) round(correl$Xcor,4) round(correl$Ycor,4) round(correl$XYcor,4) img.matcor(correl, type = 2) ``` ```{r} siedmioboj.cc <- cc(biegi,sila) siedmioboj.cc ``` ```{r,echo=FALSE} ggplot(data = data.frame(biegi=-siedmioboj.cc$scores$xscores[,1],sila=-siedmioboj.cc$scores$yscores[,1]), aes(x =biegi , y = sila, label = rownames(hm.df))) + geom_abline(intercept = 0, slope=1,colour = "gray65") + geom_text(colour = "tomato", alpha = 0.8, size = 3) + geom_point() ``` ```{r} barplot(siedmioboj.cc$cor , xlab = "wymiar", ylab = "Korelacje kanonicznes", names.arg = 1:3, ylim = c(0,1)) plt.cc(siedmioboj.cc,type="v",var.label=TRUE) ``` ##Grupowanie (analiza skupień, *cluster analysis*) ```{r,SrodekCiezkosci,echo=FALSE} SrodekCiezkosci <- function(dane){ g <- apply(dane,MARGIN = 2,mean) return(list(g = g,n = nrow(dane))) } ``` ```{r,przykład dane,echo=FALSE} x <- c(1,2,3,3,4,5,1,2,3,4,3,2) x <- array(x,dim =c(6,2)) SrodekCiezkosci(x) x.df <- as.data.frame(x) x.df x.df[7, ] <- SrodekCiezkosci(x)$g x.df[,3] <- c(rep("dane",6),"środek ciężkości") names(x.df)[3] <- "typ" require(ggplot2) ggplot(data=x.df,aes(x=V1,y=V2))+geom_point(aes(colour=typ),size=3) ``` ```{r,bezwładności,echo=FALSE} dane <- c(1,2,3,3,4,5,1,2,3,4,3,2) dane <- array(x,dim =c(6,2)) srodki_ciezkosci <- function(dane, podzial){ require(dplyr) k <- max(podzial) nc <- ncol(dane) gg <- numeric(k*(nc+1)) gg <- array(gg,dim=c(k,nc+1)) gg <- as.data.frame(gg) for (i in 1:k) { gg_rob<- dane %>% as.data.frame() %>% filter(podzial == i) %>% SrodekCiezkosci() gg[i,1:nc] <- gg_rob$g gg[i,(nc+1)] <- gg_rob$n } return(gg) } JM <- function(dane, podzial){ gg <- srodki_ciezkosci(dane, podzial) g <- SrodekCiezkosci(dane)$g jm <- 0 for (j in 1: (ncol(gg)-1)) jm <- jm + gg$V3 %*%(gg[,j]-g[j])^2 jm <- as.numeric(jm/nrow(dane)) return(jm) } podzial1 <- c(2,2,2,2,1,1) podzial2 <- c(1,2,1,2,2,1) podzial3 <- c(1,1,2,2,2,2) dane1 <- cbind(as.data.frame(dane),podzial=as.factor(podzial1)) ggplot(data=dane1,aes(x=V1,y=V2))+geom_point(aes(colour=podzial),size=3) + labs(title="Podział 1") srodki_ciezkosci(dane, podzial1) JM(dane, podzial1) dane2 <- cbind(as.data.frame(dane),podzial=as.factor(podzial2)) ggplot(data=dane2,aes(x=V1,y=V2))+geom_point(aes(colour=podzial),size=3) + labs(title="Podział 2") srodki_ciezkosci(dane, podzial2) JM(dane, podzial2) dane3 <- cbind(as.data.frame(dane),podzial=as.factor(podzial3)) ggplot(data=dane3,aes(x=V1,y=V2))+geom_point(aes(colour=podzial),size=3) + labs(title="Podział 3") srodki_ciezkosci(dane, podzial3) JM(dane, podzial3) ``` ###Komórki Woronoja ```{r} woronoj <- function(x,y) { require(deldir) require(ggplot2) df <- data.frame(x,y) #granice Woronoja voronoi <- deldir(x, y) #Wykres gr <- ggplot(data=df, aes(x=x,y=y)) gr_wyn <- gr+geom_point (size=2,color="red",alpha=0.5)+ #Plot the voronoi lines geom_segment( aes(x = x1, y = y1, xend = x2, yend = y2), size = 2, data = voronoi$dirsgs, linetype = 1, color= "#FFB958") return(gr_wyn) } set.seed(105) long<-rnorm(20,-98,15) lat<-rnorm(20,39,10) df <- data.frame(lat,long) woronoj(df$long[c(1,3,4)],df$lat[c(1,3,4)]) woronoj(df$long,df$lat) woronojPunkty <- function(dane,centra){ require(deldir) require(ggplot2) voronoi <- deldir(centra[,1], centra[,2]) gr <- ggplot(data=dane, aes(x=dane[,1],y=dane[,2])) gr_wyn <- gr+geom_point ()+ geom_point(data=centra,aes(x=centra[,1],y=centra[,2]),size=4,color="orange",alpha=0.4)+ #Plot the voronoi lines geom_segment( aes(x = x1, y = y1, xend = x2, yend = y2), size = 2, data = voronoi$dirsgs, linetype = 1, color= "#FFB958") return(gr_wyn) } dane <- c(1,2,3,3,4,5,1,2,3,4,3,2) dane <- array(dane,dim =c(6,2)) dane <- as.data.frame(dane) centra <- data.frame(x=c(2,4),y=c(3,3.5)) centra <- as.data.frame(rbind(c(2,3),c(4,3.5))) woronojPunkty(dane,centra) centra <- as.data.frame(rbind(c(2,3),c(4,3.5),c(3.5,2))) woronojPunkty(dane,centra) ``` ```{r} dminCentrum <- function(punkt,centra){ k <- nrow(centra) k_min <- -1 d_min <- Inf for (i in 1:k) { d <- dist(rbind(punkt, centra[i,])) if (d Badano zmiany zawartej w plazmie krwi stężenia glukozy [%] (zmienna 1) i wolnego kwasu tłuszczowego [mEq/l] u 12 schizofreników (grupa 1) i 13 zdrowych ochotników (grupa 2) po domięśniowym wstrzyknięciu insuliny. **Dane** Środki ciężkości ```{r,echo=FALSE} g1 <- c(-25.6,-0.06) g2 <- c(-31.1,-0.15) g1 g2 ``` Macierze kowariancji ```{r,echo=FALSE} V1 <- matrix(c(278.083,0.8291,0.8291,0.0092),nrow=2,ncol=2) V2 <- matrix(c(269.923,-0.2493,-0.2493,0.0067),nrow=2,ncol=2) V1 V2 ``` Obliczenia ```{r} p1 <- 12/25 p2 <- 13/25 g <- p1*g1+p2*g2 g VW <- p1*V1+p2*V2 VW ``` ```{r} VM <- p1*p2*(g1-g2)%*%t(g1-g2) VM ``` ```{r} V <- VW+VM V Vinv <- solve(V) Vinv u0 <- Vinv %*% (g1-g2) u0 ``` Punkt podziału ```{r} c <- 0.5 * sum(t(u0) * (g1+g2)) c ``` Reguła 100 ```{r} ug <- 100*u0/c ug ``` Dyskryminacja ```{r} x <- c(-30,-0.1) x sum(t(ug) * x) ``` Chory ```{r} x <- c(-20,-0.2) x sum(t(ug) * x) ``` Zdrowy ###Analiza Korespondencji ```{r} library(ca) ``` ```{r} t <- matrix(c(1,1,1,1,1,0),nrow=2,ncol=3) t <- as.table(t) rownames(t) <- c("K","M") colnames(t) <- c("W","P","A") t ``` ```{r} haireye <- margin.table(HairEyeColor, 1:2) print(haireye) ``` ```{r} cat("wiersze","\n") t.ca <- ca(t) t.ca$rowmass t.ca$rowcoord cat("\n","kolumny","\n") t.ca$colmass t.ca$colcoord ``` ```{r} haireye.ca <- ca(haireye) cat("wiersze","\n") haireye.ca$rowmass haireye.ca$rowcoord cat("\n","kolumny","\n") haireye.ca$colmass haireye.ca$colcoord plot(haireye.ca) # some plot options plot(haireye.ca, lines=TRUE) plot(haireye.ca, arrows=c(TRUE, FALSE)) ```