Cluster Análisis en la Encuesta

Análisis de agrupación (Cluster Analysis)

Selección de la base de datos de tiempo de estudio y tiempo en deporte

tiempos1 <- subset(encuesta, select = c(Tiempo.estudio,Tiempo.deporte))
row.names(tiempos1) <- encuesta$Encuesta
tiempos1 <- na.omit(tiempos1)

require(ggplot2)

## Loading required package: ggplot2

g1 <- ggplot(tiempos1, aes(Tiempo.estudio, Tiempo.deporte, 
                           label = row.names(tiempos1) ))
g1 + geom_point(size = 5)  +
     geom_text(vjust=0, hjust=-0.5, size = 4) +
  xlab("Tiempo dedicado al estudio") +
  ylab("Tiempo dedicado al deporte") +
  coord_fixed(ratio = 1) +
  theme(axis.title.y = element_text(size=18),
        axis.title.x = element_text(size=18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18))

Construcción del análisis aglomerativo.

cluster1 <- hclust(dist(tiempos1))

Estructura del resultado del cluster

str(cluster1)

## List of 7
##  $ merge      : int [1:71, 1:2] -5 -6 -7 -9 -57 -13 -51 -34 -62 -35 ...
##  $ height     : num [1:71] 0 0 0 0 0 0 0 0 0 0 ...
##  $ order      : int [1:72] 44 57 9 42 36 7 58 30 56 61 ...
##  $ labels     : chr [1:72] "1" "2" "3" "4" ...
##  $ method     : chr "complete"
##  $ call       : language hclust(d = dist(tiempos1))
##  $ dist.method: chr "euclidean"
##  - attr(*, "class")= chr "hclust"

Resultados de la agrupación

with(cluster1, cbind(merge,height))

##                  height
##  [1,]  -5 -52  0.000000
##  [2,]  -6 -60  0.000000
##  [3,]  -7 -58  0.000000
##  [4,]  -9 -42  0.000000
##  [5,] -57   4  0.000000
##  [6,] -13 -18  0.000000
##  [7,] -51   6  0.000000
##  [8,] -34 -55  0.000000
##  [9,] -62   8  0.000000
## [10,] -35 -68  0.000000
## [11,] -56 -61  0.000000
## [12,]  -4 -47  0.500000
## [13,]  -1 -71  1.000000
## [14,] -40  12  1.000000
## [15,] -36   3  1.000000
## [16,]  -8 -10  1.000000
## [17,] -19 -39  1.000000
## [18,] -24 -45  1.000000
## [19,] -26 -41  1.000000
## [20,] -30  11  1.000000
## [21,] -37  10  1.000000
## [22,] -27  13  1.414214
## [23,] -16   2  1.414214
## [24,] -38  22  2.000000
## [25,]   7  14  2.000000
## [26,] -64  16  2.000000
## [27,] -12 -22  2.000000
## [28,] -15 -69  2.000000
## [29,] -63  17  2.000000
## [30,] -21   9  2.000000
## [31,] -70  19  2.000000
## [32,] -54  21  2.000000
## [33,]  -2   1  2.236068
## [34,] -14  26  2.236068
## [35,] -59  32  2.236068
## [36,]  -3 -11  2.828427
## [37,]  25  28  2.828427
## [38,] -25 -46  2.828427
## [39,]   5  15  3.000000
## [40,] -23 -33  3.000000
## [41,] -53  18  3.000000
## [42,] -49  23  3.162278
## [43,] -44  39  3.605551
## [44,] -17  33  4.000000
## [45,] -20  31  4.000000
## [46,]  20  30  4.123106
## [47,]  24  42  4.242641
## [48,] -65 -66  4.472136
## [49,] -48  29  5.656854
## [50,] -67  47  5.830952
## [51,] -43  44  6.000000
## [52,]  38  49  6.000000
## [53,] -32  41  6.324555
## [54,]  34  36  6.708204
## [55,]  35  40  7.000000
## [56,]  43  46  7.211103
## [57,]  45  54  7.810250
## [58,] -28  55  8.602325
## [59,]  37  50  8.944272
## [60,] -72  52  8.944272
## [61,] -29  51 10.440307
## [62,] -31  27 12.000000
## [63,]  53  57 13.601471
## [64,]  58  59 14.866069
## [65,] -50  48 15.264338
## [66,]  56  63 17.888544
## [67,]  61  62 19.416488
## [68,]  60  64 21.095023
## [69,]  66  68 36.878178
## [70,]  65  67 37.363083
## [71,]  69  70 77.103826

Algunos casos

tiempos1[c(5,52,6,60,1,71),]

##    Tiempo.estudio Tiempo.deporte
## 5              48              4
## 53             48              4
## 6              15              5
## 61             15              5
## 1              14              2
## 73             15              2

Dendrograma

plot(cluster1)

Selección de grupos

tiempos1$grupo <- cutree(cluster1, 4)

Gráfica con cuatro grupos

require(ggplot2)
require(RColorBrewer)

## Loading required package: RColorBrewer

paletaCol <- brewer.pal(4,"Dark2")
g1 <- ggplot(tiempos1, aes(Tiempo.estudio, Tiempo.deporte, 
                           label = row.names(tiempos1),
                           col = factor(grupo)))
g1 + geom_point(size = 5)  +
     geom_text(vjust=0, hjust=-0.5, size = 4) +
    scale_colour_manual(values=paletaCol) +
  xlab("Tiempo dedicado al estudio") +
  ylab("Tiempo dedicado al deporte") +
  coord_fixed(ratio = 1) +
  theme(axis.title.y = element_text(size=18),
        axis.title.x = element_text(size=18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18))

Gráfica con las dimensiones re-escaladas

require(ggplot2)
g1 <- ggplot(tiempos1, aes(Tiempo.estudio, Tiempo.deporte, 
                           label = row.names(tiempos1) ))
g1 + geom_point(size = 5)  +
     geom_text(vjust=0, hjust=-0.5, size = 4) +
  xlab("Tiempo dedicado al estudio") +
  ylab("Tiempo dedicado al deporte") +
  theme(axis.title.y = element_text(size=18),
        axis.title.x = element_text(size=18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18))

Cluster con las variables estandarizadas.

cluster2 <- hclust(dist(scale(tiempos1)))

Dendrograma

plot(cluster2)

tiempos1$grupo <- cutree(cluster2, 6)

Gráfica con seis grupos (variables estandarizadas)

require(ggplot2)
require(RColorBrewer)
paletaCol <- brewer.pal(6,"Dark2")
g1 <- ggplot(tiempos1, aes(Tiempo.estudio, Tiempo.deporte, 
                           label = row.names(tiempos1),
                           col = factor(grupo)))
g1 + geom_point(size = 5)  +
     geom_text(vjust=0, hjust=-0.5, size = 4) +
  scale_colour_manual(values=paletaCol) +
  xlab("Tiempo dedicado al estudio") +
  ylab("Tiempo dedicado al deporte") +
  theme(axis.title.y = element_text(size=18),
        axis.title.x = element_text(size=18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18))

Agregar el grupo a la base de datos original

tiempos1$Encuesta <- row.names(tiempos1)
encuesta <- merge(encuesta,subset(tiempos1,select=c(Encuesta,grupo), by = Encuesta))

Relación de la agrupación con el rendimiento académico

require(ggplot2)
require(Hmisc)

## Loading required package: Hmisc
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

g2 <- ggplot(encuesta, aes(factor(grupo), PAPA))
g2 + geom_violin() +geom_boxplot(alpha=0.5)+
  stat_summary(fun.data = "mean_cl_boot", color = "red", size = 2) +
  xlab("Grupo") +
  ylab("Promedio Académico\nPonderado Acumulado") +
  theme(axis.title.y = element_text(size=18),
        axis.title.x = element_text(size=18),
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18))

## Warning: Removed 1 rows containing missing values (geom_segment).

Cluster Análisis en la Encuesta

Universidad Nacional de Colombia, Sede Medellín

Jueves, 8 de octrubre de 2015

Lectura de la base de datos

Análisis de agrupación (Cluster Analysis)