# qq éléments de correction du TP stats multivariées

## exo 1 "Présidentielles - AFC" ##

presid <- read.csv(paste("PATH-TO/Presidentielle.CSVPresidentielleREMOVETXT.CSV.TXT", sep = ""), row.names = 1)

summary(presid)

library(graphics)
library(vcd)

# plusieurs réprésentations graphiques des données avec interprétations d'écart à une indépendance région/candidat pr certaines
presidtable <- as.table(as.matrix(presid))
assoc(presidtable, shade = T, las = 2, main = "Associations et residus du test du chi2", labeling_args = list(abbreviate = c(A=TRUE), varnames = F, rot_labels = 90))
mosaicplot(presidtable, type = "pearson", shade = T, las = 2, main = "Associations et residus du test du chi2")
library(ade4)
table.value(presidtable, grid = T, col.labels = names(presid))

# L'AFC mise en oeuvre grâce à une fonction toute faite R
library(FactoMineR)
res.ca <- CA(presid, graph = FALSE) # AFC en français, CA en anglais...

barplot(res.ca$eig$per, ylab = "Inertie expliquee (%)", xlab = "Composante")
barplot(res.ca$eig$cum, ylab = "Inertie cumulee expliquee (%)",xlab = "Composante")
abline(h = 80, lty = 2, lwd = 2)

res.ca$call$marge.row
res.ca$call$marge.col

# représentation des 1er et 2nd plans factoriels
plot(res.ca, cex = 0.8)
plot(res.ca, axes = c(3, 4), cex = 0.8)
# on peut voir la qualité de ces représentations en étudiant
res.ca$col$coord
res.ca$col$cos2
res.ca$col$contrib
res.ca$row$coord
res.ca$row$cos2
res.ca$row$contrib
# Ou alors utiliser directement
dimdesc(res.ca)

# exo "Clustering-hôtels"

hotels <- read.csv("PATH-TO/HotelsREMOVETXT.CSV.TXT",row.names = 1)

palette(rainbow(12, s = 0.6, v = 0.75))
stars(hotels, key.loc = c(14.5, 2), draw.segments = T, main = "Diagramme en etoile des hotels")
palette("default")

library(cluster)
hotelsnum <- hotels[, -1]
# je ne presente pas toutes les metriques possibles ni toutes les criteres de distance moyenne/min/max/...
#Classif sur les individus (=les hotels)
res.cash <- agnes(hotelsnum, metric = "euclidean", method = "single")
split(rownames(hotelsnum), cutree(res.cash, k = 3))

res.dendro <- as.dendrogram(as.hclust(res.cash))
plot(res.dendro, horiz = TRUE, center = TRUE, nodePar = list(lab.cex = 0.6, lab.col = "darkblue", pch = NA), main = deparse(res.cash$call))

#Classif pr les variables
thotelsnum <- t(hotelsnum)
res.cash <- agnes(thotelsnum, metric = "euclidean", method = "single")
split(rownames(thotelsnum), cutree(res.cash, k = 3))
res.dendro <- as.dendrogram(as.hclust(res.cash))
plot(res.dendro, horiz = TRUE, center = TRUE, nodePar = list(lab.cex = 0.6, lab.col = "darkblue", pch = NA), main = deparse(res.cash$call))

# classification en utilisant l'algo k-means avec plusieurs initialisations
clhotel <- kmeans(hotelsnum, 3, nstart = 50)
colhotnum <- cbind(factor(clhotel$cluster), hotelsnum)
colcennum <- cbind(factor(1:3), clhotel$centers)
colnames(colhotnum) <- c("CLUSTERS", colnames(hotelsnum))
colnames(colcennum) <- c("CLUSTERS", colnames(hotelsnum))
datas <- rbind(colcennum, colhotnum)
datas$CLUSTERS <- factor(datas$CLUSTERS)
plot(hotelsnum, col = colhotnum$CLUSTERS)

library(FactoMineR)
res.pca <- PCA(datas, graph = FALSE, quali.sup = 1, ind.sup = 1:3)
plot(res.pca, habillage = 1, new.plot = FALSE, cex = 0.8)
plot(res.pca, axes = c(3, 4), habillage = 1, new.plot = FALSE, cex = 0.8)

# classification sur les donnees des 4 premiers axes factoriels
clhotel2 <- kmeans(res.pca$ind$coord[, 1:4], 3, nstart = 50)
colhotnum2 <- data.frame(cbind(factor(clhotel2$cluster), hotelsnum))
colnames(colhotnum2) <- c("CLUSTERS", colnames(hotelsnum))
datas2 <- colhotnum2
datas2$CLUSTERS <- factor(datas2$CLUSTERS)
plot(hotelsnum, col = colhotnum2$CLUSTERS)
res.pca2 <- PCA(datas2, graph = FALSE, quali.sup = 1)
plot(res.pca2, habillage = 1, new.plot = FALSE, cex = 0.8)
plot(res.pca2, axes = c(3, 4), habillage = 1, new.plot = FALSE, cex = 0.8)

layout(1:2)
plot(PCA(datas, graph = FALSE, quali.sup = 1, ind.sup = 1:3), habillage = 1, new.plot = FALSE, cex = 0.8)
plot(PCA(datas2, graph = FALSE, quali.sup = 1), habillage = 1, new.plot = FALSE, cex = 0.8)
layout(1)
layout(1:2)
plot(PCA(datas, graph = FALSE, quali.sup = 1, ind.sup = 1:3), habillage = 1, new.plot = FALSE, cex = 0.8)
plot(PCA(datas2, graph = FALSE, quali.sup = 1), habillage = 1, new.plot = FALSE, cex = 0.8)
layout(1)

# Ici on se passe des etoiles et on voit si on retrouve cette classif a partir des autres variables
hotelsnum2 <- hotelsnum[, -7]
clhotel3 <- kmeans(hotelsnum2, 6, nstart = 50)
colhotnum3 <- cbind(factor(clhotel3$cluster), factor(hotels$ETOILE), hotelsnum2)
colnames(colhotnum3) <- c("CLUSTERS", "ETOILE_Q", colnames(hotelsnum2))
datas3 <- colhotnum3
datas3$CLUSTERS <- factor(datas3$CLUSTERS)
plot(hotelsnum2, col = colhotnum3$CLUSTERS)
cbind(clhotel3$cluster, hotels$ETOILE)

res.pca3 <- PCA(datas3, graph = FALSE, quali.sup = c(1, 2), ind.sup = 1:6)
plot(res.pca3, habillage = 1, new.plot = FALSE, cex = 0.8)
plot(res.pca3, habillage = 2, new.plot = FALSE, cex = 0.8)
plot(res.pca3, axes = c(3, 4), habillage = 1, new.plot = FALSE, cex = 0.8)
plot(res.pca3, axes = c(3, 4), habillage = 2, new.plot = FALSE, cex = 0.8)

# La suite concerne l'etude d'un critere pr choisir un nombre de classes quand on lance un k-means. On en essaye en general plusieurs et on voit celui qui "colle le mieux".
library(vegan)
mat1 <- matrix(runif(100), 10, 10)
res1 <- cascadeKM(mat1, 2, 5, iter = 25, criterion = "calinski")
plot1 <- plot(res1)

vec2 <- sort(matrix(runif(30), 30, 1))
res2 <- cascadeKM(vec2, 2, 5, iter = 25, criterion = "calinski")
plot2 <- plot(res2)

vec3 <- sort(matrix(runif(1000), 1000, 1))
res3 <- cascadeKM(vec3, 2, 7, iter = 25, criterion = "calinski")
plot3 <- plot(res3, gridcol = NA)

## exo "AFC multiple" ##

Sexe <- rep(c("F", "M"), c(5, 5))
Revenu <- rep(c("M", "E", "M"), c(2, 5, 3))
Pref. <- c("A", "A", "B", "C", "C", "C", "B", "B", "B", "A")
Resultats <- data.frame(cbind(Sexe, Revenu, Pref.))
print(Resultats)

Resultats.cont <- as.data.frame(table(Resultats))

# A la main
burt <- function(table) {
 disj <- data.frame(lapply(1:ncol(table), function(i) {
  col <- table[, i]
  lev <- names(table)[i]
  n <- length(col)
  col <- as.factor(col)
  x <- matrix(0, n, length(levels(col)))
  x[(1:n) + n * (unclass(col) - 1)] <- 1
  dimnames(x) <- list(row.names(table), paste(lev, levels(col), sep = "."))
  return(x)
 }))
 burt <- as.matrix(t(disj)) %*% as.matrix(disj)
 burt <- data.frame(burt)
 names(burt) <- names(disj)
 row.names(burt) <- names(disj)
 return(burt)
}
Resultats.burt <- burt(Resultats))

# via une fonction déjà codée
library(ade4)
acm.burt(Resultats, Resultats)
acm.disjonctif(Resultats)

library(FactoMineR)
tab.disjonctif(Resultats)

mosaicplot(table(Resultats), main = "Mosaicplot de l'exemple")
library(vcd)
assoc(table(Resultats), main = "Assocplot de l'exemple")

Resultats.burt.acm <- CA(Resultats.burt, graph = F, ncp = 4)
Resultats.disj.acm <- CA(Resultats.disj, graph = F, ncp = 4)
Resultats.burt.acm$eig
Resultats.disj.acm$eig

plot(Resultats.burt.acm, title = "AFCM par AFC du tableau de Burt")
plot(Resultats.disj.acm, title = "AFCM par AFC du TDC")
plot(Resultats.disj.acm, title = "AFCM par AFC du TDC", invisible = "row")

multtest <- mapply(all.equal, (Resultats.burt.acm$eig$eig)[1:4], ((Resultats.disj.acm$eig$eig)[1:4])^2)
multiden <- sapply(multtest, identical, TRUE)
all(multiden)
# note: all.equal() permet des petites différences que == par exemple ne teloère pas ?!

rapcol <- Resultats.burt.acm$col$coord/Resultats.disj.acm$col$coord
sqvptdc <- sqrt((Resultats.disj.acm$eig$eig)[1:4])
rapcol %*% diag(1/sqvptdc)

Resultats.init.acm <- MCA(Resultats, graph = F, ncp = 4)
Resultats.init.acm$eig

equaltest <- function(xx1, xx2) {
 return(all(sapply(mapply(all.equal, xx1, xx2), identical, TRUE)))
}
vpburt <- (Resultats.burt.acm$eig$eig)[1:4]
vpdisj <- (Resultats.disj.acm$eig$eig)[1:4]
vpinit <- (Resultats.init.acm$eig$eig)[1:4]
equaltest(vpinit, vpburt)
equaltest(vpinit, vpdisj)
plot(Resultats.init.acm, title = "AFCM directe avec FactoMineR")
plot(Resultats.init.acm, title = "AFCM directe avec FactoMineR", invisible = "ind")
plot(Resultats.init.acm, title = "AFCM directe avec FactoMineR", new.plot = FALSE)

## exo admin fac ##

expand.dft <- function(x, na.strings = "NA", as.is = FALSE, dec = ".") {
  DF <- sapply(1:nrow(x), function(i) x[rep(i, each = x$Freq[i]),], simplify = FALSE)
  DF <- subset(do.call("rbind", DF), select = -Freq)
  for (i in 1:ncol(DF)) {
   DF[[i]] <- type.convert(as.character(DF[[i]]), na.strings = na.strings, as.is = as.is,dec = dec)
  }
DF
}

UCBA.df <- expand.dft(data.frame(UCBAdmissions))
head(UCBA.df)
tail(UCBA.df)
str(UCBA.df)
table(UCBA.df)