# mergeidf2.R
# Merge ArrayExpress IDF and SDRF metadata files into summary tables.
library(readr)
library(magrittr)
library(dplyr)
library(purrr)
library(stringr)
# Turn every two-column key/value table in a list into a one-row wide table.
#
# For each element the values are coerced to character, missing values are
# replaced by the literal string "NA", only the first two columns are kept,
# and the table is transposed so that the entries of the first column become
# the column names and the entries of the second column become the single row.
#
# listOfDF: a list of data frames (or tibbles), each with at least two columns.
# Returns:  a list of the same length (names preserved) of one-row data frames.
makeasChar <- function(listOfDF) {
  # lapply() handles an empty list; `for (i in 1:length(x))` would iterate
  # over c(1, 0) and fail on the first subscript.
  lapply(listOfDF, function(df) {
    df <- as.data.frame(df, stringsAsFactors = FALSE)
    df[] <- lapply(df, as.character)
    df[is.na(df)] <- "NA"
    keyValue <- df[, 1:2]
    # Transposing puts the keys in row 1 and the values in row 2.
    wide <- data.frame(t(keyValue), stringsAsFactors = FALSE)
    colnames(wide) <- as.character(unlist(wide[1, ]))
    # drop = FALSE keeps a data frame even when there is only one key/value
    # pair (one column); the default would collapse it to a bare vector.
    wide[-1, , drop = FALSE]
  })
}
# Reshape one two-column key/value table into a one-row wide data frame.
#
# All values are coerced to character, missing values become the literal
# string "NA", only the first two columns are kept, and the table is
# transposed: the first column supplies the column names, the second column
# supplies the single row of values. Despite its name, this script applies
# the function to the IDF tables read from "idf/".
#
# df:      a data frame (or tibble) with at least two columns.
# Returns: a data frame with one row and one column per input row.
cleanSdrf <- function(df) {
  df <- as.data.frame(df, stringsAsFactors = FALSE)
  df[] <- lapply(df, as.character)
  df[is.na(df)] <- "NA"
  keyValue <- df[, 1:2]
  # Transposing puts the keys in row 1 and the values in row 2.
  wide <- data.frame(t(keyValue), stringsAsFactors = FALSE)
  colnames(wide) <- as.character(unlist(wide[1, ]))
  # drop = FALSE keeps a data frame even for a single key/value pair; the
  # default would return a bare vector and break the caller's mutate().
  wide[-1, , drop = FALSE]
}
# Merge the IDF files downloaded from ArrayExpress ----
# Earlier one-liner, kept for reference:
#df_idf<-list.files(full.names = TRUE,path = "idf/", pattern = "*.txt") %>% lapply(read_tsv,col_names = F)%>% makeasChar %>% bind_rows()

# `pattern` is a regular expression, not a shell glob, so the extension is
# matched explicitly and anchored at the end of the file name.
flist <- list.files(path = "idf/", pattern = "\\.txt$", full.names = TRUE)

# One slot per file, so the list is not grown inside the loop.
listFiles <- vector("list", length(flist))
for (k in seq_along(flist)) {
  f <- flist[[k]]
  print(f)
  # Read every field as text so that type guessing cannot rewrite values.
  thisF <- read_tsv(f, col_names = FALSE,
                    col_types = cols(.default = col_character()))
  # Transpose the key/value layout into a single wide row.
  thisF <- cleanSdrf(thisF)
  # Strip two extensions from the file name (e.g. ".idf.txt") to get the
  # accession.
  thisFname <- tools::file_path_sans_ext(tools::file_path_sans_ext(basename(f)))
  # Repeated field names are made unique (suffix .1, .2, ...), not removed.
  colnames(thisF) <- make.unique(colnames(thisF))
  thisF <- thisF %>% mutate(Accession = thisFname)
  listFiles[[k]] <- thisF
}

# Stack the per-file rows; fields missing from a file are filled with NA.
df_idf <- bind_rows(listFiles)
write_tsv(df_idf, "idf_summary.tsv")
## Read the SDRF files ----
# Note: the SDRF from E-MTAB-1264 had duplicated rows for the CEL files; it
# was edited and the duplicates were removed.

# `pattern` is a regular expression, not a shell glob, so the extension is
# matched explicitly and anchored at the end of the file name.
flistsdrf <- list.files(path = "sdrf/", pattern = "\\.txt$", full.names = TRUE)

# One slot per file, so the list is not grown inside the loop.
listFiles <- vector("list", length(flistsdrf))
for (k in seq_along(flistsdrf)) {
  f <- flistsdrf[[k]]
  print(f)
  # Read every column as character. Guessing types first and converting back
  # with as.character() alters the text: a column holding only "F"/"T" is
  # guessed as logical and would come out as "FALSE"/"TRUE".
  thisF <- read_tsv(f, col_names = TRUE,
                    col_types = cols(.default = col_character()))
  # The accession is the part of the file name before the first dot.
  #thisFname<-tools::file_path_sans_ext(tools::file_path_sans_ext(basename(f)))
  thisFname <- unlist(strsplit(basename(f), ".", fixed = TRUE))[1]
  # Repeated column names are made unique (suffix .1, .2, ...), not removed.
  colnames(thisF) <- make.unique(colnames(thisF))
  # All columns are already character, so the tables can be stacked without
  # type conflicts.
  thisF <- thisF %>% mutate(Accession = thisFname)
  listFiles[[k]] <- thisF
}

# Stack the per-file tables; columns missing from a file are filled with NA.
df_sdrf <- bind_rows(listFiles)
# Summarise the merged SDRF table ----
# Give the raw-data file column a name without spaces; the keyword list
# below refers to it as "Array_Data_File".
colnames(df_sdrf)[colnames(df_sdrf) == "Array Data File"] <- "Array_Data_File"
df_sdrf[is.na(df_sdrf)] <- "NA"

# Check that every row has an array data file: list the source names of the
# rows that do not. print() is explicit because a bare top-level expression
# is not shown when the script is run through source().
print(df_sdrf$`Source Name`[which(df_sdrf$Array_Data_File == "NA")])

# Keep only the columns whose name contains one of the keywords
# (case-insensitive), plus the accession.
keyWords <- c("organism", "cell", "disease", "treatment", "arrayexpress",
              "Array_Data_File", "Material Type", "Protocol", "Sample",
              "Description", "Characteristics")
colsToKeep <- grepl(paste(keyWords, collapse = "|"), colnames(df_sdrf),
                    ignore.case = TRUE)
colsToKeep[colnames(df_sdrf) == "Accession"] <- TRUE
df_sdrf_final <- df_sdrf[, colsToKeep]

# Order the columns by name; this fixes the order in which values are
# concatenated below.
df_sdrf_final <- df_sdrf_final[, colnames(df_sdrf_final)[order(colnames(df_sdrf_final))]]

# Blank out missing values (NA or the literal "NA") so that they are skipped
# when columns are pasted together. Columns that are entirely empty need no
# explicit removal: they contribute nothing to the merged strings.
df_sdrf_final[is.na(df_sdrf_final) | df_sdrf_final == "NA"] <- ""

# Merge columns by keyword: for each keyword, paste the non-empty values of
# all matching columns into one "<keyword>_Info" column, separated by ";".
# A column can match several keywords and then contributes to each of them.
sdrfInfo <- as.data.frame(df_sdrf_final[, "Accession"])
for (keyw in keyWords) {
  print(keyw)
  thisColName <- paste(keyw, "Info", sep = "_")
  thisCols <- colnames(df_sdrf_final)[grepl(keyw, colnames(df_sdrf_final),
                                            ignore.case = TRUE)]
  tempdf <- as.data.frame(df_sdrf_final[, thisCols])
  sdrfInfo[, thisColName] <- apply(
    tempdf, 1,
    function(row) paste(row[nzchar(row)], collapse = ";")
  )
}

# Drop identical rows, then build the key that identifies one data column:
# "<Accession>_<array data file>". The column name "DataColum" is used again
# when the metadata is combined, so its spelling is kept.
df_sdrf_final <- sdrfInfo %>% distinct
df_sdrf_final <- df_sdrf_final %>%
  mutate(DataColum = paste(Accession, Array_Data_File_Info, sep = "_"))

# Report keys that occur more than once.
print(df_sdrf_final$DataColum[duplicated(df_sdrf_final$DataColum)])

# Write to file.
write_tsv(df_sdrf_final, "sdrf_summary.tsv")
# Combine IDF and SDRF summaries ----
# Join on the accession only. Without `by`, every column name the two tables
# happen to share would silently become part of the join key, and the IDF
# column names are data-driven.
combinedMD <- left_join(df_idf, df_sdrf_final, by = "Accession")
combinedMD <- combinedMD %>% distinct

# Keep only the data columns listed in the `Combined` column of
# Acc_to_Cel.txt.
Acc_to_Cel <- read_delim("Acc_to_Cel.txt", delim = "\t",
                         escape_double = FALSE, trim_ws = TRUE)
combinedMD <- combinedMD %>% filter(DataColum %in% Acc_to_Cel$Combined)

# Coerce everything to character and mark missing or empty cells as "NA".
# NOTE(review): data.frame() with its default check.names may rewrite column
# names that are not syntactic (e.g. names containing spaces) - confirm this
# is wanted in the output header.
combinedMD <- data.frame(lapply(combinedMD, as.character), stringsAsFactors = FALSE)
combinedMD[is.na(combinedMD)] <- "NA"
combinedMD[combinedMD == ""] <- "NA"

# NOTE(review): the de-duplicated table is computed but never written; the
# file below is written from combinedMD. Confirm which one is intended.
combinedMD2 <- combinedMD %>% distinct

# Data column keys that occur more than once. Left as a single expression:
# the column header of the written file appears to be derived from it.
repeatedDC <- as.data.frame(combinedMD$DataColum[duplicated(combinedMD$DataColum)])
write_tsv(repeatedDC, "repeatedDC.tsv")
write_tsv(combinedMD, "combined_metadata.tsv")