######### First Example ##############
##### The American States in 1977 ####
# As done in class
# The data file is built in to R
# Variables for the 50 states
summary(state.x77)
# Where R thinks the states are located
plot(state.center,type="n")
text(state.center,state.abb)
# First try at PCA of the states
state.pca1 <- prcomp(state.x77)
# Output summary:
print(state.pca1,digits=3)
# Scree plot
plot(state.pca1,type="l")
# Projection/biplot
biplot(state.pca1,cex=c(0.75,1))
# Standard deviations of the variables
apply(state.x77,2,sd)
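# (Quick check on why scaling matters: Population and Area have much larger
# standard deviations than the other variables, so the unscaled PCA above is
# dominated by them -- look at each component's share of the total variance)
signif(state.pca1$sdev^2/sum(state.pca1$sdev^2),3)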
# Second try at PCA of the states
state.pca <- prcomp(state.x77,scale.=TRUE)
# Biplot: projections of states plus how the variables relate to the components
biplot(state.pca,cex=c(0.5,0.75))
# What are the first three components?
signif(state.pca$rotation[,1:3],3)
# Scree plot
plot(state.pca,type="l")
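# (The scree plot in numbers: standard deviation, proportion of variance, and
# cumulative proportion for each component, via summary.prcomp)
signif(summary(state.pca)$importance,3)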
# Function to plot the state abbreviations in position, with scaled sizes
# Linearly scale the sizes from the given minimum to the maximum
# Inputs: vector of raw numbers, minimum size for plot, maximum size,
  # further plotting arguments passed along to plot()
# Outputs: Rescaled sizes (invisible)
plot.states_scaled <- function(sizes,min.size=0.4,max.size=2,...) {
out.range = max.size - min.size
in.range = max(sizes)-min(sizes)
scaled.sizes = out.range*((sizes-min(sizes))/in.range)
sizes = scaled.sizes + min.size
plot(state.center,type="n",...)
text(state.center,state.abb,cex=sizes)
invisible(sizes)
}
# PC1 plotted geographically
# Arguably it's "Southerness"
plot.states_scaled(state.pca$x[,1],min.size=0.3,max.size=1.5,
xlab="longitude",ylab="latitude")
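# (A rough check on that reading: the states with the largest and smallest
# scores on PC1; row names carry over from state.x77)
head(sort(state.pca$x[,1],decreasing=TRUE))
head(sort(state.pca$x[,1]))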
############# Second Example ##############
########### NY Times stories ##############
# News stories randomly selected from the New York Times Annotated Corpus
# 57 stories about art and 45 about music
# Load the stories from a saved workspace
# Modify the location to be wherever you download the workspace
load("~/teaching/402/lectures/17-pca/pca-examples.Rdata")
# The workspace now contains:
# nyt.frame.raw: a data frame with counts of words (columns) in stories (rows)
# first column, "class.labels", is a factor indicating "art"
# or "music"
# nyt.frame: the same, with word counts suitably normalized and weighted
# art: list where each element is itself a vector of words giving the
  # actual stories about art, with punctuation removed, etc.
# music: ditto
# Some miscellaneous functions used to create the data sets (see end of
# this file for gory details)
# We'll work with nyt.frame
# How big is it?
dim(nyt.frame)
# Remember: rows = stories, columns = words (except the first column, which
# is the type of story)
# What are some typical words?
colnames(nyt.frame)[sample(ncol(nyt.frame),30)]
# A little bit of the data
signif(nyt.frame[sample(nrow(nyt.frame),5),sample(ncol(nyt.frame),10)],3)
# What you should see: lots of zeroes! Most words do not appear in most stories.
# Harder to see: strong correlations between words which do appear.
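# (A quick way to quantify "lots of zeroes": the fraction of zero entries
# among the word-count columns)
mean(as.matrix(nyt.frame[,-1])==0)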
# Do PCA, omitting the first column of class labels
nyt.pca = prcomp(nyt.frame[,-1])
# Extract the actual component directions/weights for ease of reference
nyt.latent.sem = nyt.pca$rotation
# What are the components?
# Show the 30 words with the biggest positive loading on PC1
signif(sort(nyt.latent.sem[,1],decreasing=TRUE)[1:30],2)
# The 30 words with the biggest negative loadings on PC1, the other end of that scale
signif(sort(nyt.latent.sem[,1],decreasing=FALSE)[1:30],2)
# Ditto for PC 2
signif(sort(nyt.latent.sem[,2],decreasing=TRUE)[1:30],2)
signif(sort(nyt.latent.sem[,2],decreasing=FALSE)[1:30],2)
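# (How much of the total variance do the first two components actually carry?)
signif(summary(nyt.pca)$importance[,1:2],3)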
# Plot the projection of the stories on to the first 2 components
# Establish the plot window
plot(nyt.pca$x[,1:2],type="n")
# Arts stories with red As
points(nyt.pca$x[nyt.frame[,"class.labels"]=="art",1:2],pch="A",col="red")
# Music stories with blue Ms
points(nyt.pca$x[nyt.frame[,"class.labels"]=="music",1:2],pch="M",col="blue")
# The separation is very good, even with only two components.
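# (A crude check on that separation: tabulate the class labels against the
# sign of the score on PC1 alone)
table(nyt.frame[,"class.labels"],nyt.pca$x[,1]>0)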
####################### The End ##################
# Actual creation of the pca-examples.Rdata file
# Not much point to this unless you're me
# The files referred to here are at www.stat.cmu.edu/~cshalizi/350/
# The NYT corpus is in XML
library(XML)
# Load my functions
source("~/teaching/350/hw/01/01.R")
# Load in the stories
art <- read.directory("~/teaching/350/hw/01/nyt_corpus/art")
music <- read.directory("~/teaching/350/hw/01/nyt_corpus/music")
# Turn each story into a bag of words
art.bow <- lapply(art,table)
music.bow <- lapply(music,table)
# Make a single data frame by combining the bags of words
nyt.frame.raw <- make.BoW.frame(c(art.bow,music.bow))
# Weight by inverse document frequency
nyt.frame <- idf.weight(nyt.frame.raw)
# Normalize by vector length
nyt.frame <- div.by.euc.length(nyt.frame)
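# (Rough sketches only, assuming a plain numeric document-term data frame with
# stories in rows and words in columns; the real idf.weight and
# div.by.euc.length are defined in the 01.R file sourced above)
idf.weight.sketch <- function(x) {
  # weight each word by log(number of documents / number containing the word)
  doc.freq <- colSums(x>0)
  doc.freq[doc.freq==0] <- 1     # avoid dividing by zero for unused words
  sweep(x,2,log(nrow(x)/doc.freq),"*")
}
div.by.euc.length.sketch <- function(x) {
  # scale each story's word vector to unit Euclidean length
  lengths <- sqrt(rowSums(x^2))
  lengths[lengths==0] <- 1       # leave all-zero rows alone
  sweep(x,1,lengths,"/")
}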
# Add class labels
class.labels <- c(rep("art",57),rep("music",45))
nyt.frame.raw <- data.frame(class.labels=as.factor(class.labels),nyt.frame.raw)
nyt.frame <- data.frame(class.labels=as.factor(class.labels),nyt.frame)
# The normalizing and weighting functions don't work well with non-numeric
# columns, so it's simpler to add the labels at the end
# Now save the workspace...