-
Notifications
You must be signed in to change notification settings - Fork 1
/
7_lmmTreatment_responsive_genes_bsplinecluster.r
212 lines (185 loc) · 8.41 KB
/
7_lmmTreatment_responsive_genes_bsplinecluster.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
##################
#README
#The script takes BTH treatment-responsive gnees (additive genes removed, fdr<0.001)
#and performs b-spline fitting of rosette size MPH (y-axis) against expression MPH of each gene (x-axis).
#The spline coefficients are then used for K-means clustering
##################
#Config
library(cluster)
library(clValid)
library(splines)
library(scales)
wd<-"/ebio/abt6_projects7/SHB2/data/1_analysis/6_BTHdiffgenes/"
outbasename<-"run197_RowFilter_BTHlmm_"
outbasename1<-paste0(outbasename,"rmadd_fdr0.001_")
metafile<-"Run197_rmCMRTPHBS_rm2MSeqtkDupOL_corr.meta.txt"
logTPMfile<-"run197_BTHdiffgene_rowfiltered_logTPM.txt"
Treatgenelist<-"run197_RowFilter_BTHlmm_rmadd_SimpleModel_FixEffect.txt"
MPHidxfile<-"run197_BTHdiffgene_MetaRowIdx_forpairedMPHcalc_manualcorr.txt"
###################
#data input
setwd(wd)
meta<-read.table(metafile,header=T,sep="\t",row.names=1)
meta$Treatment<-factor(meta$Treatment,levels=c("Mock","BTH"))
meta$IsHybrid<-factor(meta$IsHybrid,levels=c("Inbred","Hybrid"))
logTPM<-read.table(logTPMfile,header=T,sep="\t",row.names=1)
Treatlmm_rmadd<-read.table(Treatgenelist,header=T,sep="\t",row.names = 1)
MPHidx<-read.table(MPHidxfile,header=T,sep="\t")
MPHidxM<-which(MPHidx$Treatment=="Mock") # these are just row numbers, used for indexing/sorting/spline fitting in the subsequent step
MPHidxB<-which(MPHidx$Treatment=="BTH")
############################################
#Subset by fdr<0.001
BTHrmadd0.001<-Treatlmm_rmadd[Treatlmm_rmadd$padjBTH_rmadd<0.001,] #8102 genes
logTPM_rmadd0.001<-logTPM[rownames(logTPM)%in%rownames(BTHrmadd0.001),]
#calculate MPH for each of the 8K+ genes in each trio, then perform spline clustering with either all 8k+ genes
scaleBTHgenes<-t(scale(t(logTPM_rmadd0.001)))
calc_trioxpnMPH<-function(idx,gene){
fxp<-gene[as.numeric(idx[7])]
mxp<-gene[as.numeric(idx[8])]
hxp<-gene[as.numeric(idx[9])]
mphxp<-(hxp-0.5*(fxp+mxp))
}
xpnMPH<-t(apply(scaleBTHgenes,1,function(x){apply(MPHidx,1,calc_trioxpnMPH,gene=x)}))#each gene correspond to one row, within which is the MPH for each trio*treatment*rep
#########
#construct b-spline on quantiles of x (expression value), then cluster the coefficients
#note that spline should be fitted for mock and BTH separately
splinefitM<-list()
splinefitB<-list()
splinecoef<-data.frame()
for(i in 1:nrow(xpnMPH)){
print(i)
gene<-rownames(xpnMPH)[i]
dfM<-data.frame("x"=xpnMPH[i,MPHidxM],y=MPHidx$MPHmm2[MPHidxM])
dfB<-data.frame("x"=xpnMPH[i,MPHidxB],y=MPHidx$MPHmm2[MPHidxB])
dfM<-dfM[order(dfM$x),]
dfB<-dfB[order(dfB$x),]
nsfitM<-ns(dfM$x,knots=quantile(dfM$x,probs=c(0.33,0.67)))
nsfitB<-ns(dfB$x,knots=quantile(dfB$x,probs=c(0.33,0.67)))
modelnsM<-lm(dfM$y~nsfitM)
modelnsB<-lm(dfB$y~nsfitB)
splinefitM[[i]]<-data.frame("x"=dfM$x,"spline"=fitted(modelnsM))
splinefitB[[i]]<-data.frame("x"=dfB$x,"spline"=fitted(modelnsB))
coef<-c(modelnsM$coefficients, modelnsB$coefficients)
splinecoef<-rbind(splinecoef,coef)
}
rownames(splinecoef)<-rownames(xpnMPH)
colnames(splinecoef)<-paste0(rep(c("Intercept","nsfit1","nsfit2","nsfit3"),2),rep(c(".M",".B"),each=4))
#output spline coefficient
write.table(splinecoef,paste0(outbasename1,"splinecoefficient.txt"),quote=F,sep="\t")
#########
#clustering
maxk=100
genlist<-splinecoef
set.seed(109)
#calculate elbow, silhouette, and dunn index
ds<-dist(genlist)
avg_elb.sil<-data.frame()
testclust<-c()
for(k in 2:maxk){
print(k)
km <- kmeans(genlist, centers = k, nstart=100,iter.max =3000000)
testclust<-cbind(testclust,km$cluster)
print("elbow")
perc_within<-km$tot.withinss/km$totss
print("silhouette")
ss <- silhouette(km$cluster, ds)
meanss<-mean(ss[, 3])
print("Dunn")
dunnIdx<-dunn(distance=ds,clusters=km$cluster)
avg_elb.sil<-rbind(avg_elb.sil,c(perc_within,meanss,dunnIdx))
}
# compute gap statistic
gap_stat <- clusGap(genlist, FUN = kmeans, nstart = 30, K.max = maxk, B = 50)
print(gap_stat, method = "firstmax")
pdf(paste0(outbasename1,"MPHpredspline_clust_OptimalK.pdf"),width=10,height=15)
par(mfrow=c(3,1))
plot(2:maxk, type='l', avg_elb.sil[,1], main="Elbow plot",xlab='Number of clusters', ylab='Total within sum of squares', frame=FALSE)
text(2:maxk,avg_elb.sil[,1],2:maxk,col="indianred4",cex=0.7,srt=45)
plot(2:maxk, type='l', avg_elb.sil[,2], main="Silhouette plot",xlab='Number of clusters', ylab='Average Silhouette Scores', frame=FALSE)
text(2:maxk,avg_elb.sil[,2],2:maxk,col="indianred4",cex=0.7,srt=45)
plot(2:maxk, type='l',avg_elb.sil[,3], main="Dunn Index",xlab="Number of clusters",ylab="Dunn Index",frame=F)
text(2:maxk,avg_elb.sil[,3],2:maxk,col="indianred4",cex=0.7,srt=45)
plot(1:maxk,gap_stat$Tab[,3],type="b",main="Gap statistic",xlab="Number of clusters",ylab="Gap statistic",frame=FALSE,pch=20)
arrows(x0=1:maxk,y0=(gap_stat$Tab[,3]-gap_stat$Tab[,4]),x1=1:maxk, y1=(gap_stat$Tab[,3]+gap_stat$Tab[,4]),code=3,angle=90,length=0.05)
dev.off()
colnames(testclust)<-paste0("K",2:maxk)
write.table(testclust,file=paste0(outbasename1,"MPHpredsplinek2-200clustID.txt"),quote=F,sep="\t")
################
#find strategy to determine optimal cluster numbers
#histogram cluster size per division
pdf(paste0(outbasename1,"2-200clustsize.pdf"),width=50,height=25)
par(mfrow=c(5,10))
for(i in 1:ncol(testclust)){
k<-(i+1)
clustsz<-table(testclust[,i])
barplot(sort(clustsz),axes=F,main=paste0("K=",k),xlab="Cluster ID", ylab="No. cluster members",cex.main=2,cex.lab=2)
axis(side=2,las=2)#one tick mark per 5% of genes
for(quant in quantile(clustsz,probs=c(0.05,0.25,0.5,0.75,0.95))) {
abline(h=quant,col="indianred4",lwd=3,lty=3)
}
abline(h=ceiling(nrow(testclust)*0.05), col="purple",lwd=3)
text(0,ceiling(nrow(testclust)*0.05),"5%",pos=3)
abline(h=ceiling(nrow(testclust)*0.025), col="purple",lwd=3)
text(0,ceiling(nrow(testclust)*0.025),"2.5%",pos=3)
abline(h=ceiling(nrow(testclust)*0.01), col="purple",lwd=3)
text(0,ceiling(nrow(testclust)*0.01),"1%",pos=3)
}
dev.off()
#####
#plot results: cluster mean, individual cluster spline
#from the above result, picked local peaks of best clusters k=7,15,56,77
#prep for base spline model matrix. This is originated from values within the range of scaled expression MPH (-4,4)
rangeserie<-seq(-4,4,0.1)
nsrangeserie<-ns(rangeserie,knots=quantile(rangeserie,probs=c(0.33,0.67)))
rangeseriemat<-model.matrix(~nsrangeserie) #use this to get fitted average spline value
K<-c(7,15,56,77)
for(k in K){
print(k)
clustID<-testclust[,(k-1)]
print(table(clustID))
#get the mean coefficient for each cluster
ClustSplineCoef<-c()
for(clust in 1:k){
ClustSplineCoef<-rbind(ClustSplineCoef,apply(splinecoef[which(clustID==clust),],2,mean))
}
pdfname<-paste0(outbasename1,"bsplineK",k,"_MPHprediction.pdf")
pdf(pdfname,width=60,height=35)
par(mfrow=c(7,12)) #plot Mock and BTH side by side
for(clust in 1:k){
print(clust)
print("Mock")
#retrieve the average cluster coefficient
meanCoefM<-ClustSplineCoef[clust,1:4]
meanSplinefitM<-rangeseriemat%*%meanCoefM
plot(seq(-4,4,length=10),seq(-50,200,length=10),xlim=c(-4,4),ylim=c(-80,220),type="n",
xlab="expression MPH",ylab="rosette size MPH",main=paste0("Cluster", clust, " Mock n=", table(clustID)[clust]),
cex.main=1.8,cex.lab=1.5)
#first plot individual lines for each gene
for(i in which(clustID==clust)){
geneM<-splinefitM[[i]]
lines(geneM$x,geneM$spline,col=alpha("springgreen4",0.15),lwd=0.3)
}
abline(h=0,lty=2,lwd=3)
abline(v=0,lty=2,lwd=3)
lines(rangeserie,meanSplinefitM,col="springgreen4",lwd=4)
print("BTH")
#retrieve the average cluster coefficient
meanCoefB<-ClustSplineCoef[clust,5:8]
meanSplinefitB<-rangeseriemat%*%meanCoefB
# plot(seq(-4,4,length=10),seq(-50,200,length=10),xlim=c(-4,4),ylim=c(-80,220),type="n",
# xlab="expression MPH",ylab="rosette size MPH",main=paste0("Cluster", clust, " BTH n=", table(clustID)[clust]))
#first plot individual lines for each gene
for(i in which(clustID==clust)){
geneB<-splinefitB[[i]]
lines(geneB$x,geneB$spline,col=alpha("darkorange2",0.15),lwd=0.3)
}
abline(h=0,lty=2,lwd=3)
abline(v=0,lty=2,lwd=3)
lines(rangeserie,meanSplinefitB,col="darkorange2",lwd=4)
}
dev.off()
}
################
#of course the more cluster there are, the more refined each cluster is. But the indi cluster size suffer
#use medium-level cluster numbers and make super cluster would be a good idea
#genes from the "flat" clusters are removed, then clustering was ran again on the remaining genes.