-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathathalAffy.R
More file actions
207 lines (157 loc) · 6.65 KB
/
athalAffy.R
File metadata and controls
207 lines (157 loc) · 6.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
library("ArrayExpress")
library(affy)
library(dplyr)
library(gcrma)
library(readr)
library(ggbiplot)
library(sva)
library(ggplot2)
library(exploBATCH)
library(arrayQualityMetrics)
#query
#ATsets = queryAE(species = "Arabidopsis+thaliana")
#keep ones with raw data
#ATsetsfiltered<-ATsets %>% filter(tolower(Species)=="arabidopsis thaliana") %>% filter(tolower(Raw)=="yes")
#read exps list downloaded from website
AT_AE_Experiments_181227_014344 <- read_delim("ArrayExpress-Experiments-181227-194246.tsv",
"\t", escape_double = FALSE, trim_ws = TRUE)
ATsetsfiltered<-AT_AE_Experiments_181227_014344 %>% filter(tolower(Organism)=="arabidopsis thaliana") %>% filter(`Raw Data`!="Data is not available") %>% filter(Assays >4)
ATsetsfiltered_10<-ATsetsfiltered%>% filter(Assays >9)
ATsetsfiltered_10_100<-ATsetsfiltered%>% filter(Assays >9) %>% filter(Assays <101)
sum(ATsetsfiltered_10_100$Assays)
ATsetsfiltered_10_50<-ATsetsfiltered%>% filter(Assays >9) %>% filter(Assays <51)
sum(ATsetsfiltered_10_50$Assays)
ATsetsfiltered_10_50_2010 <- ATsetsfiltered_10_50 %>% filter(`Release Date` >= as.Date("2010-01-01"))
sum(ATsetsfiltered_10_50_2010$Assays)
write.csv(ATsetsfiltered_10_50_2010$Accession,"ATsetsfiltered_10_50_2010_Exp.csv",row.names = F)
smallATset<-head(ATsetsfiltered)
#download all .cel files
failed<-c()
celData<-data.frame()
idlist<-smallATset$Accession
for(id in idlist){
print(id)
atSamp = getAE(id, type = "raw")
thisRawList<-atSamp$rawFiles
#if there are no .cel files
celInd<-grep("\\.CEL$",thisRawList,ignore.case = T)
if(length(celInd)<1){
print("Err")
failed<-c(failed,id)
}else{
celData<-bind_rows(celData,data.frame(Exp=rep(id,length(thisRawList)),Cel=thisRawList))
}
}
celData<-celData %>% mutate(batchId=as.numeric(factor(celData$Exp)))
#read all cel files downloaded
files = list.files(pattern = "\\.CEL$", full.names = TRUE,ignore.case = T)
affy.data = ReadAffy(filenames = files)
#apply gcrma
eset <- gcrma(affy.data)
thisPD<-pData(eset)
thisExp<-exprs(eset)
save.image("eset.RData")
dim(thisExp)
#log 2 normalize
thisExplog<-log2(thisExp)
dim(thisExplog)
#apply PCA to identify batch affects
tData<-t(thisExp) #changes dimention ???
dim(tData)
tDatalog<-t(thisExplog)
dim(tDatalog)
#
#filter to remove 0 variance cols (genes)
tDatalogF <- tDatalog[,apply(tDatalog, 2, var, na.rm=TRUE) != 0]
dim(tDatalogF)
#see dist of corr vals
corrmat<-cor(tDatalogF)
dim(corrmat)
corrVec<-data.frame(cors=as.vector(corrmat[upper.tri(corrmat,diag = F)]))
sd<-head(corrVec,1000000)
p<-ggplot(data = corrVec,aes(x=cors))+ geom_histogram(bins = 10,colour="black", fill="white")
pdf("plot_nobatchcorr.pdf")
print(p)
dev.off()
esetPCA<-prcomp(tDatalogF, scale. = TRUE)
ggbiplot(esetPCA, obs.scale = 1, var.scale = 1,
groups = factor(celData$Exp), ellipse = TRUE, circle = TRUE,varname.size=0,var.axes = F) +
scale_color_discrete(name = '') +
theme(legend.direction = 'horizontal', legend.position = 'top')
#perform batch correction
batch = celData$Exp
# parametric adjustment
combat_edata1 = ComBat(dat=thisExp, batch=batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
dim(combat_edata1)
tDatalog<-t(log2(combat_edata1+1))
dim(tDatalog)
#filter to remove 0 variance cols (genes)
tDatalogF <- tDatalog[,apply(tDatalog, 2, var, na.rm=TRUE) != 0]
dim(tDatalogF)
esetPCA<-prcomp(tDatalogF, scale. = TRUE)
corrmat<-cor(tDatalog)
# non-parametric adjustment, mean-only version
combat_edata2 = ComBat(dat=thisExp, batch=batch, mod=NULL, par.prior=FALSE, mean.only=TRUE)
dim(combat_edata2)
tDatalog<-t(log2(combat_edata2))
dim(tDatalog)
#filter to remove 0 variance cols (genes)
tDatalogF <- tDatalog[,apply(tDatalog, 2, var, na.rm=TRUE) != 0]
dim(tDatalogF)
esetPCA<-prcomp(tDatalogF, scale. = TRUE)
#clean files
data(wine)
wine.pca <- prcomp(wine, scale. = TRUE)
ggbiplot(wine.pca, obs.scale = 1, var.scale = 1,
groups = wine.class, ellipse = TRUE, circle = TRUE,varname.size=0) +
scale_color_discrete(name = '') +
theme(legend.direction = 'horizontal', legend.position = 'top')
#explobatch
require(exploBATCH)
require(exploBATCHbreast) # the package with breast cancer dataset
data(Breast) # load the breast cancer data
data(batchBreast) # load the variable defining batches
expBATCH(
D=Breast, # the breast cancer expression data matrix
batchCL=batchBreast, # the variable identifying the three batches
Conf=NA, # no biological variable of interest in this example
mindim=2, # the minimum number of pPCs
maxdim=9, # the maximum number of pPCs, we don't want this argument to be large,
# otherwise it will slow down exploBATCH.
method="ppcca", # use correctBATCH as well as ComBat to correct batch effect
SDselect=2 # set SD at 2 to reduce computational time.
)
#Batch should be numeric
expBATCH(
D=tData, # the breast cancer expression data matrix
batchCL=as.numeric(factor(celData$Exp)), # the variable identifying the three batches
Conf=NA, # no biological variable of interest in this example
mindim=2, # the minimum number of pPCs
maxdim=9, # the maximum number of pPCs, we don't want this argument to be large,
# otherwise it will slow down exploBATCH.
method="ppcca", # use correctBATCH as well as ComBat to correct batch effect
SDselect=0 # set SD at 2 to reduce computational time.
)
#filter good files
celData <- read_delim("C:/Users/mrbai/Desktop/celData.tsv",
"\t", escape_double = FALSE, trim_ws = TRUE)
toKeepExp<-ATsetsfiltered_10$Accession
celData_toKeep<-celData[celData$Exp %in% toKeepExp,]
length(unique(celData_toKeep$Exp))
length(unique(toKeepExp))
goodfiles <- read_csv("C:/Users/mrbai/Desktop/goodfiles.csv")
goodfiles$x<-substr(goodfiles$x,3,nchar(goodfiles$x))
celData_toKeep<-celData_toKeep[celData_toKeep$Cel %in% goodfiles$x,]
write.csv(celData_toKeep,"celDataReduced.csv",row.names = F)
celData_toKeep$Cel<-paste("./",celData_toKeep$Cel,sep = "")
write.csv(celData_toKeep$Cel,"goodFilesReduced.csv",row.names = F)
#zip files coressponding to exps
zipList <- read_csv("zipList.txt", col_names = FALSE)
#filter zip list which matches accession in ATsetsfiltered_10_100
zipList$X1<-substr(zipList$X1,3,nchar(zipList$X1))
zipList$X2<-sapply(strsplit(as.character(zipList$X1), ".raw"), "[[", 1)
filterZipList<- zipList[zipList$X2 %in% ATsetsfiltered_10_100$Accession,]
write_tsv(filterZipList,"zipList_10_100.tsv")
length(unique(filterZipList$X1))
#final celdata for 10_100 excluding failed to download files
celData_10_100<- celData[celData$Exp %in% unique(filterZipList$X2),]