############################################# #Khattree, R. and Naik, D. N.(2000) #Correspondence analysis. #In Multivariate Data Reduction and Discrimination with SAS software (pp. 443--509) #SAS institute and John Wiley ############################################# #------------------------------------------# #Step1: 7.2 Correspondence Analysis (pp.444--449) #------------------------------------------# ##(a) 7.1式 a <- 4 b <- 3 P <- matrix(c(5, 10, 12, 10, 5, 30, 0, 1, 5, 10, 12, 0), nrow=a, ncol=b) * .01 P #7.1式 ##(b) row and col sums of P r <- matrix(rowSums(P)) c <- matrix(colSums(P)) Dr <- matrix(0, ncol=a, nrow=a) Dc <- matrix(0, ncol=b, nrow=b) diag(Dr) <- r diag(Dc) <- c #(c) rows profiles (7.2式) R <- solve(Dr) %*% P R #7.3式 rowSums(R) #sum of the elements of any row profile is one. ##(d) column profiles (7.2式) C <- solve(Dc) %*% t(P) C rowSums(C) ##(e) average row and colum profiles c t(r) %*% R #7.4式 r t(c) %*% C ##(f) quantity d^2 t(R[2,] - c) %*% solve(Dc) %*% (R[2,] - c) t(R[2,] - c) %*% (R[2,] - c) ##(g) singular value decomposition m <- min(c(a, b) - 1) #the rank of P k <- 2 T <- sqrt(solve(Dr)) %*% (P - r %*% t(c)) %*% sqrt(solve(Dc)) A <- sqrt(Dr) %*% svd(T)$u[, 1:k] B <- sqrt(Dc) %*% svd(T)$v[, 1:k] Lamda <- matrix(0, nrow=k, ncol=k) diag(Lamda) <- svd(T)$d[1:k] ##(h) principal coordinates F <- solve(Dr) %*% A %*% Lamda G <- solve(Dc) %*% B %*% Lamda ##(i) standard cordinates solve(Dc) %*% B #------------------------------------------# #Step2: 関数定義 #------------------------------------------# my.corresp <- function(tab, k=2){ a <- nrow(tab) b <- ncol(tab) P <- prop.table(tab) r <- matrix(rowSums(P)) c <- matrix(colSums(P)) Dr <- matrix(0, ncol=a, nrow=a) Dc <- matrix(0, ncol=b, nrow=b) diag(Dr) <- r diag(Dc) <- c #rows and colum profiles (7.2式) R <- solve(Dr) %*% P C <- solve(Dc) %*% t(P) #singular value decomposition m <- min(c(a, b) - 1) #the rank of P k <- k T <- sqrt(solve(Dr)) %*% (P - r %*% t(c)) %*% sqrt(solve(Dc)) A <- sqrt(Dr) %*% svd(T)$u[, 1:k] B <- sqrt(Dc) %*% svd(T)$v[, 1:k] Lamda <- matrix(0, nrow=k, ncol=k) lamda <- svd(T)$d[1:k] diag(Lamda) <- lamda ##principal coordinates F <- solve(Dr) %*% A %*% Lamda G <- solve(Dc) %*% B %*% Lamda ## standard cordinates Ga <- solve(Dc) %*% B ##chisq chisq <- sum(tab) * sum(lamda^2) df <- (a - 1) * (b - 1) p.value <- pchisq(chisq, df, lower.tail=F) ##命名 rowlab <- paste(names(attr(tab, "dimnames"))[1], rownames(tab), sep=":") collab <- paste(names(attr(tab, "dimnames"))[2], colnames(tab), sep=":") dimlab <- paste("Dim", 1:k, sep="") colnames(F) <- colnames(G) <- colnames(Ga) <- dimlab rownames(F) <- rowlab rownames(G) <- rownames(Ga) <- collab ##asymmetric plot plot(rbind(F, Ga), type="n", main="Asymmetric plot") lines(F, type="p", pch=1) text(F, rowlab, pos=3) lines(Ga, type="p", pch=2) text(Ga, collab, pos=2) ##symmetric plot X11() plot(rbind(F, G), type="n", main="Symmetric plot") lines(F, type="p", pch=1) text(F, rowlab, pos=3) lines(G, type="p", pch=2) text(G, collab, pos=2) output <- list(table=tab, row.profile=R, col.profile=C, row=F, col=G, cola = Ga, lamda=lamda, chisq=chisq, df=df, p.value=p.value) return(output) } #------------------------------------------# #Step3: Example 1 Analysis of Headache Types by Age Data (pp.449--456) #------------------------------------------# tenkai <- function(f) { nc <- ncol(f) nr <- nrow(f) list(x=rep(rep(1:nr, nc), f), y=rep(rep(1:nc, each=nr), f)) } ##(a) データセットの作成 ex1 <- as.table(matrix(c(77, 52, 20, 66, 82, 68, 31, 49, 37, 20, 44, 31, 27, 52, 25), ncol=3, byrow=T)) ex1 <- data.frame(tenkai(ex1)) ex1[, 1] <- factor(ex1[, 1], labels=c("0-19", "20-29", "30-39", "40-49", "50-59")) ex1[, 2] <- factor(ex1[, 2], labels=c("Ndiagno", "Tension", "Migraine")) names(ex1) <- c("age", "categories") ##(b) 分析 my.corresp(table(ex1)) graphics.off() ##(c) ade4 library(ade4) ex1 <- as.data.frame(table(ex1)[1:5, 1:3]) x <- dudi.coa(ex1, nf=2) 2 scatter.dudi(x) x$co x$li graphics.off() ##(d) FactoMineR library(FactoMineR) x <- CA(ex1) x$row x$col graphics.off() #------------------------------------------# #Step4: Example 2 Correspondence analysis of socioeconomic status by mental health of children data (pp.456--463) #------------------------------------------# ##(a) データセットの作成 ex2 <- as.table(matrix(c(64,94,58,46, 57,94,54,40, 57,105,65,60, 72,141,77,94, 36,97,54,78, 21,71,54,71), ncol=4, byrow=T)) ex2 <- data.frame(tenkai(ex2)) ex2[, 1] <- factor(ex2[, 1], labels=c("1","2","3","4","5","6")) ex2[, 2] <- factor(ex2[, 2], labels=c("Well", "Mild", "Moderate", "Impaired")) names(ex2) <- c("status", "mental") ##(b) 分析 my.corresp(table(ex2)) ##(c) ade4 ex2 <- as.data.frame(table(ex2)[1:6, 1:4]) x <- dudi.coa(ex2, nf=2) 2 scatter.dudi(x) x$co x$li graphics.off() ##(d) FactoMineR x <- CA(ex2, ncp=2) x$row x$col graphics.off() #------------------------------------------# #Step5: Example 3 Multiple correspondence analysis of rating of drug's efficacy: analegesic drugs data (pp.463--471) #------------------------------------------# ##(a) データセットの作成 ex3 <- as.table(matrix(c(5,1,10,8,6, 5,3,3,8,12, 10,6,12,3,0, 7,12,8,1,1), ncol=5, byrow=T)) ex3 <- data.frame(tenkai(ex3)) ex3[, 1] <- factor(ex3[, 1], labels=c("z100","ec4","c60","c15")) ex3[, 2] <- factor(ex3[, 2], labels=c("poor","fair","good","verygood","excellent")) names(ex3) <- c("drug", "efficacy") Z1 <- model.matrix(~drug - 1, data=ex3) Z2 <- model.matrix(~efficacy - 1, data=ex3) colnames(Z1) <- gsub("drug","", colnames(Z1)) colnames(Z2) <- gsub("efficacy","", colnames(Z2)) Z <- cbind(Z1, Z2) burt <- t(Z) %*% Z ##(b) ade4 x <- dudi.acm(ex3, nf=2, scann = FALSE) x$co boxplot(x) scatter(x) graphics.off() ##(c) FactoMineR x <- MCA(ex3, ncp=2) x$var$coord graphics.off()