# Dataset reference (USArrests): https://www.rdocumentation.org/packages/datasets/versions/3.6.2/topics/USArrests
library(cluster)
library(ggplot2)
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(GGally)
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
library(fossil)
Loading required package: sp
Loading required package: maps
Attaching package: ‘maps’
The following object is masked from ‘package:cluster’:
votes.repub
Loading required package: shapefiles
Loading required package: foreign
Attaching package: ‘shapefiles’
The following objects are masked from ‘package:foreign’:
read.dbf, write.dbf
# Peek at the raw data: the first six rows, then per-column summaries.
head(x = USArrests, n = 6L)
summary(object = USArrests)
Murder Assault UrbanPop Rape
Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
Median : 7.250 Median :159.0 Median :66.00 Median :20.10
Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
# Standardise every column (subtract the mean, divide by the standard
# deviation) so the variables are on a comparable scale. The result is a
# numeric matrix, not a data frame.
df <- scale(x = USArrests, center = TRUE, scale = TRUE)
summary(object = df)
Murder Assault UrbanPop Rape
Min. :-1.6044 Min. :-1.5090 Min. :-2.31714 Min. :-1.4874
1st Qu.:-0.8525 1st Qu.:-0.7411 1st Qu.:-0.76271 1st Qu.:-0.6574
Median :-0.1235 Median :-0.1411 Median : 0.03178 Median :-0.1209
Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
3rd Qu.: 0.7949 3rd Qu.: 0.9388 3rd Qu.: 0.84354 3rd Qu.: 0.5277
Max. : 2.2069 Max. : 1.9948 Max. : 1.75892 Max. : 2.6444
# Pairwise Euclidean distances between the standardised rows of df.
# The whole lower triangle is printed, so long output is cut off at
# getOption("max.print").
dist(x = df, method = "euclidean")
Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware Florida Georgia
Alaska 2.7037541
Arizona 2.2935197 2.7006429
Arkansas 1.2898102 2.8260386 2.7177583
California 3.2631104 3.0125415 1.3104842 3.7636409
Colorado 2.6510673 2.3265187 1.3650307 2.8310512 1.2876185
Connecticut 3.2152975 4.7399125 3.2628575 2.6076395 4.0663898 3.3279920
Delaware 2.0192927 3.6213633 1.9093696 1.8003239 3.0737852 2.5547456 1.7568475
Florida 2.2981353 2.9967642 1.7493928 3.3721968 2.0250039 2.4458600 4.4700701 3.0614170
Georgia 1.1314351 2.8194388 2.7871963 2.2117614 3.3780585 2.8649105 3.9738227 2.9838715 2.1812958
Hawaii 3.3885300 4.5301340 3.2621208 2.9723097 3.6589083 2.8233524 1.3843291 2.4748807 4.3596338 3.8105218
Idaho 2.9146623 4.0580555 3.5210071 1.7687255 4.4879436 3.4767685 1.6354214 2.0382540 4.6999827 3.8005715
Illinois 1.8734993 3.2670626 1.0825512 2.4626424 1.9117469 1.7898322 2.7400560 1.5584719 1.7711863 2.3135778
Indiana 2.0761411 3.3655952 2.6407486 1.4450503 3.4061273 2.3655622 1.6147898 1.6973340 3.6150778 2.6924143
Iowa 3.4878952 4.7251910 4.1157513 2.4252661 4.9708591 3.9406898 1.5470089 2.6068606 5.2682765 4.2517889
Kansas 2.2941096 3.6808173 2.7762838 1.5718411 3.6071725 2.6272281 1.2280424 1.5510864 3.8424558 3.0071474
Kentucky 1.8475879 3.5440903 3.3567681 1.0598104 4.2463809 3.2274013 2.3346386 2.2514939 3.9474983 2.4408198
Louisiana 0.7722224 2.9631431 2.2178519 2.0254276 3.0176625 2.6546743 3.5329409 2.3266996 1.7529677 0.8592544
Maine 3.4851115 4.8322605 4.2961903 2.3621893 5.2699843 4.2713441 1.8792141 2.6560808 5.3946798 4.3334217
Maryland 1.2896460 2.2777590 1.2117356 2.0582244 2.2312581 1.9667562 3.4968269 1.9624834 1.4355204 1.8388691
Massachusetts 2.9874810 4.3729925 2.5162281 2.6881270 3.2156499 2.6522793 0.9468199 1.4382527 3.7753087 3.6706708
Hawaii Idaho Illinois Indiana Iowa Kansas Kentucky Louisiana Maine Maryland
Alaska
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
Florida
Georgia
Hawaii
Idaho 2.3658101
Illinois 2.7329756 3.2728945
Indiana 1.5460727 1.4923351 2.2027081
Iowa 2.1564575 0.8584962 3.7380070 1.7786548
Kansas 1.4648766 1.2103118 2.3228505 0.4287712 1.4699265
Kentucky 2.5203345 1.6565236 2.8478883 1.1790552 1.9426473 1.3020180
Louisiana 3.5687157 3.5283772 1.6535178 2.4957547 4.0359614 2.7284126 2.4221964
Maine 2.7160558 0.8486112 3.9342034 2.1029158 0.6457158 1.7913753 1.9925855 4.0901924
Maryland 3.6148670 3.4014584 1.3429997 2.5430878 4.0642448 2.7400943 2.8229479 1.2739137 4.1259083
Massachusetts 1.3276676 2.2201020 2.0080982 1.6615695 2.3510287 1.4343401 2.6284451 3.1524549 2.6920282 2.9743193
Massachusetts Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada New Hampshire
Alaska
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
Florida
Georgia
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
New Jersey New Mexico New York North Carolina North Dakota Ohio Oklahoma Oregon Pennsylvania
Alaska
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
Florida
Georgia
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
Rhode Island South Carolina South Dakota Tennessee Texas Utah Vermont Virginia Washington
Alaska
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
Florida
Georgia
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
West Virginia Wisconsin
Alaska
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
Florida
Georgia
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
[ reached getOption("max.print") -- omitted 29 rows ]
# Agglomerative clustering with Ward's criterion ("ward.D2") on Euclidean
# distances, shown first as a base dendrogram and then as a factoextra
# dendrogram cut into four boxed clusters.
wardhclust <- hclust(
  dist(df, method = "euclidean"),
  method = "ward.D2"
)
plot(x = wardhclust)
fviz_dend(x = wardhclust, k = 4, rect = TRUE)
Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
# Same distances, but merged with complete linkage; plotted the same two
# ways (base dendrogram, then factoextra dendrogram cut at k = 4).
completehclust <- hclust(
  dist(df, method = "euclidean"),
  method = "complete"
)
plot(x = completehclust)
fviz_dend(x = completehclust, k = 4, rect = TRUE)
Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
# Choose the number of clusters: elbow plot of the total within-cluster sum
# of squares (with reference lines at k = 2 and k = 4) and the average
# silhouette width.
# kmeans() starts from randomly chosen centres, so the seed is fixed and
# several restarts are requested (nstart is forwarded to kmeans()) to make
# both curves reproducible from run to run.
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss", nstart = 25) +
  geom_vline(xintercept = 4, linetype = 2) +
  geom_vline(xintercept = 2, linetype = 2)
set.seed(123)
fviz_nbclust(df, kmeans, method = "silhouette", nstart = 25)
# Fit and print a 4-cluster k-means solution.
# kmeans() picks its initial centres at random; fixing the seed makes the
# result reproducible, and nstart = 25 keeps the best of 25 random starts
# instead of relying on a single, possibly poor, initialisation.
set.seed(123)
kmeans(df, centers = 4, nstart = 25)
K-means clustering with 4 clusters of sizes 8, 13, 16, 13
Cluster means:
Murder Assault UrbanPop Rape
1 1.4118898 0.8743346 -0.8145211 0.01927104
2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
3 -0.4894375 -0.3826001 0.5758298 -0.26165379
4 0.6950701 1.0394414 0.7226370 1.27693964
Clustering vector:
Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware
1 4 4 1 4 4 3 3
Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
4 1 3 2 4 3 2 3
Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota Mississippi
2 1 2 4 3 4 2 1
Missouri Montana Nebraska Nevada New Hampshire New Jersey New Mexico New York
4 2 2 4 2 3 4 4
North Carolina North Dakota Ohio Oklahoma Oregon Pennsylvania Rhode Island South Carolina
1 2 3 3 3 3 3 1
South Dakota Tennessee Texas Utah Vermont Virginia Washington West Virginia
2 1 4 3 2 3 3 2
Wisconsin Wyoming
2 3
Within cluster sum of squares by cluster:
[1] 8.316061 11.952463 16.212213 19.922437
(between_SS / total_SS = 71.2 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size" "iter"
[9] "ifault"
# Visualise a 4-cluster k-means solution.
# The model is refitted here, so the seed is fixed and multiple starts are
# used; otherwise every run of this line could draw a different partition.
set.seed(123)
fviz_cluster(
  kmeans(df, centers = 4, nstart = 25),
  data = df,
  ellipse.type = "euclid",
  star.plot = TRUE,
  repel = TRUE
)
# Scatterplot matrix of the four raw (unscaled) variables.
ggpairs(data = USArrests)
plot: [1,1] [=====>---------------------------------------------------------------------------------------] 6% est: 0s
plot: [1,2] [===========>---------------------------------------------------------------------------------] 12% est: 0s
plot: [1,3] [================>----------------------------------------------------------------------------] 19% est: 0s
plot: [1,4] [======================>----------------------------------------------------------------------] 25% est: 0s
plot: [2,1] [============================>----------------------------------------------------------------] 31% est: 0s
plot: [2,2] [==================================>----------------------------------------------------------] 38% est: 0s
plot: [2,3] [========================================>----------------------------------------------------] 44% est: 0s
plot: [2,4] [=============================================>-----------------------------------------------] 50% est: 0s
plot: [3,1] [===================================================>-----------------------------------------] 56% est: 0s
plot: [3,2] [=========================================================>-----------------------------------] 62% est: 0s
plot: [3,3] [===============================================================>-----------------------------] 69% est: 0s
plot: [3,4] [=====================================================================>-----------------------] 75% est: 0s
plot: [4,1] [===========================================================================>-----------------] 81% est: 0s
plot: [4,2] [================================================================================>------------] 88% est: 0s
plot: [4,3] [======================================================================================>------] 94% est: 0s
plot: [4,4] [=============================================================================================]100% est: 0s
# Principal component analysis of the standardised matrix; printing the fit
# shows the component standard deviations and the rotation (loadings).
prcomp(x = df, center = TRUE)
Standard deviations (1, .., p=4):
[1] 1.5748783 0.9948694 0.5971291 0.4164494
Rotation (n x k) = (4 x 4):
PC1 PC2 PC3 PC4
Murder -0.5358995 0.4181809 -0.3412327 0.64922780
Assault -0.5831836 0.1879856 -0.2681484 -0.74340748
UrbanPop -0.2781909 -0.8728062 -0.3780158 0.13387773
Rape -0.5434321 -0.1673186 0.8177779 0.08902432
# Variance explained by each principal component (individual and cumulative).
summary(object = prcomp(x = df, center = TRUE))
Importance of components:
PC1 PC2 PC3 PC4
Standard deviation 1.5749 0.9949 0.59713 0.41645
Proportion of Variance 0.6201 0.2474 0.08914 0.04336
Cumulative Proportion 0.6201 0.8675 0.95664 1.00000
# Scree plot of the PCA, with the bars labelled.
fviz_eig(
  prcomp(x = df, center = TRUE),
  addlabels = TRUE
)
# Agreement between the Ward dendrogram cut at k = 4 and a 4-cluster
# k-means partition, measured with the Rand index.
# k-means is refitted here from random starting centres, so the seed is
# fixed and several starts are used; without that the reported index
# changes from one execution to the next.
set.seed(123)
rand.index(
  cutree(wardhclust, k = 4),
  kmeans(df, centers = 4, nstart = 25)$cluster
)
[1] 0.8440816