Análisis factorial por componentes principales en R

library(ggplot2)
library(readxl)
library(plotly)
library(ggrepel)
library(knitr)
library(kableExtra)
library(reshape2)
library(PerformanceAnalytics)
library(psych)
library(ltm)
library(Hmisc)
library(dplyr)
library(tidyverse)

Carga de base de datos

NotasFatorial <- read_excel("notas_fatorial.xlsx")

Visualización de base de datos

NotasFatorial[1:6, 1:5] %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", 
                full_width = FALSE,
                font_size = 14)

estudante	finanças	custos	marketing	atuária
Gabriela	5.8	4	1.0	6.0
Luiz Felipe	3.1	3	10.0	2.0
Patrícia	3.1	4	4.0	4.0
Gustavo	10.0	8	8.0	8.0
Letícia	3.4	2	3.2	3.2
Ovídio	10.0	10	1.0	10.0

Estatísticas descritivas

summary(NotasFatorial)

  estudante            finanças          custos         marketing     
 Length:100         Min.   : 0.600   Min.   : 1.900   Min.   : 1.000  
 Class :character   1st Qu.: 3.100   1st Qu.: 2.900   1st Qu.: 3.000  
 Mode  :character   Median : 5.800   Median : 4.000   Median : 6.000  
                    Mean   : 5.834   Mean   : 4.717   Mean   : 5.668  
                    3rd Qu.: 9.000   3rd Qu.: 6.000   3rd Qu.: 8.000  
                    Max.   :10.000   Max.   :10.000   Max.   :10.000  
    atuária      
 Min.   : 1.700  
 1st Qu.: 3.200  
 Median : 5.000  
 Mean   : 5.314  
 3rd Qu.: 7.025  
 Max.   :10.000

Dispersión y ajuste lineal entre las variables ‘custos’ y ‘finanças’

NotasFatorial %>%
  ggplot() +
  geom_point(aes(x = finanças, y = custos),
             color = "darkorchid",
             size = 3) +
  geom_smooth(aes(x = finanças, y = custos),
              color = "orange", 
              method = "lm", 
              formula = y ~ x, 
              se = FALSE,
              size = 1.3) +
  labs(x = "Finanças",
       y = "Custos") +
  theme_bw()

Dispersión y ajuste lineal entre variables ‘custos’ y ‘marketing’

NotasFatorial %>%
  ggplot() +
  geom_point(aes(x = marketing, y = custos),
             color = "darkorchid",
             size = 3) +
  geom_smooth(aes(x = marketing, y = custos),
              color = "orange", 
              method = "lm", 
              formula = y ~ x, 
              se = FALSE,
              size = 1.3) +
  labs(x = "Marketing",
       y = "Custos") +
  theme_bw()

Dispersión y ajuste lineal entre variables ‘custos’ y ‘atuária’

NotasFatorial %>%
  ggplot() +
  geom_point(aes(x = atuária, y = custos),
             color = "darkorchid",
             size = 3) +
  geom_smooth(aes(x = atuária, y = custos),
              color = "orange", 
              method = "lm", 
              formula = y ~ x, 
              se = FALSE,
              size = 1.3) +
  labs(x = "Atuária",
       y = "Custos") +
  theme_bw()

Coeficientes de correlación de Pearson para cada par de variables

rho <- rcorr(as.matrix(NotasFatorial[,2:5]), type="pearson")

corr_coef <- rho$r # Matriz de correlación
corr_sig <- round(rho$P, 5) # Matriz con p-valor de coeficientes

Elaboración de un mapa de calor de las correlaciones de Pearson entre las variables

ggplotly(
  NotasFatorial[,2:5] %>%
    cor() %>%
    melt() %>%
    rename(Correlação = value) %>%
    ggplot() +
    geom_tile(aes(x = Var1, y = Var2, fill = Correlação)) +
    geom_text(aes(x = Var1, y = Var2, label = format(Correlação, digits = 1)),
              size = 5) +
    scale_fill_viridis_b() +
    labs(x = NULL, y = NULL) +
    theme_bw())

Visualización de distribuciones variables, dispersiones, valores de correlación.

chart.Correlation(NotasFatorial[, 2:5], histogram = TRUE, pch = "+")

Elaboración de Análisis Factorial por Componentes Principales

Prueba de esfericidad de Bartlett

cortest.bartlett(NotasFatorial[, 2:5])

R was not square, finding R from data

$chisq
[1] 191.8791

$p.value
[1] 1.013914e-38

$df
[1] 6

Elaboración de análisis factorial por componentes principales

fatorial <- principal(NotasFatorial[, 2:5],
                      nfactors = length(NotasFatorial[, 2:5]),
                      rotate = "none",
                      scores = TRUE)
fatorial

Principal Components Analysis
Call: principal(r = NotasFatorial[, 2:5], nfactors = length(NotasFatorial[, 
    2:5]), rotate = "none", scores = TRUE)
Standardized loadings (pattern matrix) based upon correlation matrix
            PC1   PC2   PC3   PC4 h2       u2 com
finanças   0.90  0.01  0.44  0.09  1 -6.7e-16 1.5
custos     0.93  0.05 -0.12 -0.33  1 -1.1e-15 1.3
marketing -0.04  1.00  0.00  0.02  1  1.1e-16 1.0
atuária    0.92 -0.01 -0.30  0.26  1  0.0e+00 1.4

                       PC1  PC2  PC3  PC4
SS loadings           2.52 1.00 0.30 0.18
Proportion Var        0.63 0.25 0.07 0.05
Cumulative Var        0.63 0.88 0.95 1.00
Proportion Explained  0.63 0.25 0.07 0.05
Cumulative Proportion 0.63 0.88 0.95 1.00

Mean item complexity =  1.3
Test of the hypothesis that 4 components are sufficient.

The root mean square of the residuals (RMSR) is  0 
 with the empirical chi square  0  with prob <  NA 

Fit based upon off diagonal values = 1

Valores propios

eigenvalues <- round(fatorial$values, 5)
eigenvalues

[1] 2.51813 1.00038 0.29762 0.18388

Suma de valores propios = 4 (número de variables en el análisis)

También representa la cantidad máxima de factores posibles en el análisis.

round(sum(eigenvalues), 2)

[1] 4

Identificación de la varianza compartida en cada factor

variancia_compartilhada <- as.data.frame(fatorial$Vaccounted) %>% 
  slice(1:3)

rownames(variancia_compartilhada) <- c("Autovalores",
                                       "Prop. da Variância",
                                       "Prop. da Variância Acumulada")

Varianza compartida por las variables originales para la formación de cada factor

round(variancia_compartilhada, 3) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", 
                full_width = FALSE, 
                font_size = 14)

	PC1	PC2	PC3	PC4
Autovalores	2.518	1.00	0.298	0.184
Prop. da Variância	0.630	0.25	0.074	0.046
Prop. da Variância Acumulada	0.630	0.88	0.954	1.000

Cálculo de puntajes factoriales

scores_fatoriais <- as.data.frame(fatorial$weights)

Visualización de puntajes factoriales

round(scores_fatoriais, 3) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", 
                full_width = FALSE, 
                font_size = 14)

	PC1	PC2	PC3	PC4
finanças	0.356	0.007	1.467	0.471
custos	0.371	0.049	-0.402	-1.811
marketing	-0.017	0.999	-0.001	0.099
atuária	0.364	-0.010	-1.022	1.389

Cálculo de los propios factores

fatores <- as.data.frame(fatorial$scores)

View(fatores)

Coeficientes de correlación de Pearson para cada par de factores (ortogonal)

rho <- rcorr(as.matrix(fatores), type="pearson")
round(rho$r, 4)

    PC1 PC2 PC3 PC4
PC1   1   0   0   0
PC2   0   1   0   0
PC3   0   0   1   0
PC4   0   0   0   1

Cálculo de cargas factoriales

cargas_fatoriais <- as.data.frame(unclass(fatorial$loadings))

Visualización de cargas factoriales

round(cargas_fatoriais, 3) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", 
                full_width = FALSE, 
                font_size = 14)

	PC1	PC2	PC3	PC4
finanças	0.895	0.007	0.437	0.087
custos	0.934	0.049	-0.120	-0.333
marketing	-0.042	0.999	0.000	0.018
atuária	0.918	-0.010	-0.304	0.255

Cálculo de puntos en común

comunalidades <- as.data.frame(unclass(fatorial$communality)) %>%
  rename(comunalidades = 1)

Visualización de puntos en común (aquí son iguales a 1 para todas las variables)

4 factores fueron extraídos en este primer momento

round(comunalidades, 3) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped",
                full_width = FALSE,
                font_size = 14)

	comunalidades
finanças	1
custos	1
marketing	1
atuária	1

Elaboración del análisis factorial por componentes principales

Factores extraídos de valores propios mayores que 1

Definición del número de factores con autovalores mayores a 1

k <- sum(eigenvalues > 1)
print(k)

[1] 2

Elaboración de análisis factorial por componentes principales

Con ‘k’ número de factores con valores propios mayores que 1

fatorial2 <- principal(NotasFatorial[, 2:5],
                      nfactors = k,
                      rotate = "none",
                      scores = TRUE)
fatorial2

Principal Components Analysis
Call: principal(r = NotasFatorial[, 2:5], nfactors = k, rotate = "none", 
    scores = TRUE)
Standardized loadings (pattern matrix) based upon correlation matrix
            PC1   PC2   h2      u2 com
finanças   0.90  0.01 0.80 0.19821   1
custos     0.93  0.05 0.87 0.12522   1
marketing -0.04  1.00 1.00 0.00033   1
atuária    0.92 -0.01 0.84 0.15773   1

                       PC1  PC2
SS loadings           2.52 1.00
Proportion Var        0.63 0.25
Cumulative Var        0.63 0.88
Proportion Explained  0.72 0.28
Cumulative Proportion 0.72 1.00

Mean item complexity =  1
Test of the hypothesis that 2 components are sufficient.

The root mean square of the residuals (RMSR) is  0.06 
 with the empirical chi square  4.25  with prob <  NA 

Fit based upon off diagonal values = 0.99

Cálculo de puntos en común con solo los primeros factores ‘k’ (‘k’ = 2)

comunalidades2 <- as.data.frame(unclass(fatorial2$communality)) %>%
  rename(comunalidades = 1)

Visualización de puntos en común con solo los primeros factores ‘k’ (‘k’ = 2)

round(comunalidades2, 3) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped",
                full_width = FALSE,
                font_size = 14)

	comunalidades
finanças	0.802
custos	0.875
marketing	1.000
atuária	0.842

Diagrama de carga con las cargas de los primeros factores ‘k’ (‘k’ = 2)

cargas_fatoriais[, 1:2] %>% 
  data.frame() %>%
  rownames_to_column("variáveis") %>%
  ggplot(aes(x = PC1, y = PC2, label = variáveis)) +
  geom_point(color = "darkorchid",
             size = 3) +
  geom_text_repel() +
  geom_vline(aes(xintercept = 0), linetype = "dashed", color = "orange") +
  geom_hline(aes(yintercept = 0), linetype = "dashed", color = "orange") +
  expand_limits(x= c(-1.25, 0.25), y=c(-0.25, 1)) +
  theme_bw()

Agregar los factores extraídos a la base de datos original

NotasFatorial <- bind_cols(NotasFatorial,
                           "fator 1" = fatores$PC1, 
                           "fator 2" = fatores$PC2)

Creación de un ranking Criterio de suma ponderada y ranking)

NotasFatorial$ranking <- fatores$PC1 * variancia_compartilhada$PC1[2] +
                         fatores$PC2 * variancia_compartilhada$PC2[2]

Visualización de la clasificación final

NotasFatorial[1:6, 1:8] %>%
  arrange(desc(ranking)) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped",
                full_width = FALSE,
                font_size = 14)

estudante	finanças	custos	marketing	atuária	fator 1	fator 2	ranking
Gustavo	10.0	8	8.0	8.0	1.3458058	0.8868535	1.0690249
Ovídio	10.0	10	1.0	10.0	1.9796313	-1.5530229	0.8578377
Luiz Felipe	3.1	3	10.0	2.0	-1.0770424	1.5026257	-0.3022334
Gabriela	5.8	4	1.0	6.0	0.0151560	-1.6650640	-0.4068827
Patrícia	3.1	4	4.0	4.0	-0.6002072	-0.6039721	-0.5288997
Letícia	3.4	2	3.2	3.2	-0.9793448	-0.9220541	-0.8471296

Análisis factorial por componentes principales en R

Análisis factorial PCA

Josue Huaman

2022-12-07

Instalación y carga de librerias

Elaboración de Análisis Factorial por Componentes Principales

Suma de valores propios = 4 (número de variables en el análisis)

Visualización de puntos en común (aquí son iguales a 1 para todas las variables)

Elaboración del análisis factorial por componentes principales

Visualización de la clasificación final