diff --git a/NAMESPACE b/NAMESPACE index e84ce42..94b06a0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,9 +18,11 @@ importFrom(RCy3,mapVisualProperty) importFrom(RCy3,setVisualStyle) importFrom(grDevices,colorRamp) importFrom(grDevices,rgb) +importFrom(httr,GET) importFrom(httr,POST) importFrom(httr,add_headers) importFrom(httr,content) +importFrom(httr,status_code) importFrom(jsonlite,fromJSON) importFrom(jsonlite,toJSON) importFrom(r2r,hashmap) diff --git a/R/getSubnetworkFromIndra.R b/R/getSubnetworkFromIndra.R index d0d2f39..ab3e232 100644 --- a/R/getSubnetworkFromIndra.R +++ b/R/getSubnetworkFromIndra.R @@ -30,6 +30,9 @@ #' @param force_include_other character vector of identifiers to include in the #' network, regardless if those ids are in the input data. Should be formatted #' as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911". +#' @param filter_by_curation logical, whether to subtract evidence curated as +#' incorrect in INDRA before applying evidence_count_cutoff. Default is FALSE. +#' @param api_key string, INDRA API key; only used if filter_by_curation is TRUE. 
#' #' @return list of 2 data.frames, nodes and edges #' @@ -53,11 +56,13 @@ getSubnetworkFromIndra <- function(input, correlation_cutoff = 0.3, sources_filter = NULL, logfc_cutoff = NULL, - force_include_other = NULL) { + force_include_other = NULL, + filter_by_curation = FALSE, + api_key = "") { input <- .filterGetSubnetworkFromIndraInput(input, pvalueCutoff, logfc_cutoff, force_include_other) .validateGetSubnetworkFromIndraInput(input, protein_level_data, sources_filter, force_include_other) res <- .callIndraCogexApi(input$HgncId, force_include_other) - res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter) + res <- .filterIndraResponse(res, statement_types, evidence_count_cutoff, sources_filter, filter_by_curation, api_key) edges <- .constructEdgesDataFrame(res, input, protein_level_data) edges <- .filterEdgesDataFrame(edges, paper_count_cutoff, correlation_cutoff) nodes <- .constructNodesDataFrame(input, edges) diff --git a/R/utils_getSubnetworkFromIndra.R b/R/utils_getSubnetworkFromIndra.R index 4d3071c..911c863 100644 --- a/R/utils_getSubnetworkFromIndra.R +++ b/R/utils_getSubnetworkFromIndra.R @@ -66,25 +66,53 @@ return(res) } +#' @importFrom httr GET status_code content +#' @importFrom jsonlite fromJSON +.get_incorrect_curation_count <- function(stmt_hash, api_key) { + stmt_hash_char <- as.character(stmt_hash) + url <- paste0("https://db.indra.bio/curation/list/", stmt_hash_char, "?api_key=", api_key) + + tryCatch({ + response <- GET(url) + if (status_code(response) == 200) { + curations <- fromJSON(content(response, "text", encoding = "UTF-8")) + if (length(curations) == 0) { + return(0) + } + incorrect_curations <- curations[curations$tag != "correct", ] + unique_incorrect <- length(unique(incorrect_curations$source_hash)) + + return(unique_incorrect) + } else { + warning(paste("API request failed for hash", stmt_hash_char, + "with status code", status_code(response))) + return(0) + } + }, error = function(e) { + 
warning(paste("Error processing hash", stmt_hash_char, ":", e$message)) + return(0) + }) +} + #' Call INDRA Cogex API and return response #' @param res response from INDRA #' @param interaction_types interaction types to filter by #' @param evidence_count_cutoff number of evidence to filter on for each paper #' @param sources_filter list of sources to filter by. Default is NULL, i.e. no filter +#' @param filter_by_curation logical, whether to filter out statements that +#' have been curated as incorrect in INDRA. Default is FALSE. +#' @param api_key string of INDRA API key for accessing curated statements. #' @return filtered list of INDRA statements #' @importFrom jsonlite fromJSON #' @keywords internal #' @noRd -.filterIndraResponse <- function(res, interaction_types, evidence_count_cutoff, sources_filter = NULL) { +.filterIndraResponse <- function(res, interaction_types, evidence_count_cutoff, + sources_filter = NULL, filter_by_curation = FALSE, api_key = "") { if (!is.null(interaction_types)) { res = Filter( function(statement) statement$data$stmt_type %in% interaction_types, res) } - res = Filter( - function(statement) statement$data$evidence_count >= evidence_count_cutoff, - res - ) if (!is.null(sources_filter)) { res = Filter( function(statement) { @@ -95,6 +123,19 @@ res ) } + if (filter_by_curation) { + for (i in seq_along(res)) { + stmt_hash <- res[[i]]$data$stmt_hash + incorrect_count <- .get_incorrect_curation_count(stmt_hash, api_key) + res[[i]]$data$evidence_count <- res[[i]]$data$evidence_count - incorrect_count + # Todo: Also subtract source_counts accordingly if requested + Sys.sleep(0.1) + } + } + res = Filter( + function(statement) statement$data$evidence_count >= evidence_count_cutoff, + res + ) return(res) } diff --git a/man/getSubnetworkFromIndra.Rd b/man/getSubnetworkFromIndra.Rd index ac8fa6f..94d27f9 100644 --- a/man/getSubnetworkFromIndra.Rd +++ b/man/getSubnetworkFromIndra.Rd @@ -14,7 +14,9 @@ getSubnetworkFromIndra( correlation_cutoff = 
0.3, sources_filter = NULL, logfc_cutoff = NULL, - force_include_other = NULL + force_include_other = NULL, + filter_by_curation = FALSE, + api_key = "" ) } \arguments{ @@ -54,6 +56,11 @@ is NULL, i.e. no logFC filtering.} \item{force_include_other}{character vector of identifiers to include in the network, regardless if those ids are in the input data. Should be formatted as "namespace:identifier", e.g. "HGNC:1234" or "CHEBI:4911".} + +\item{filter_by_curation}{logical, whether to subtract evidence curated as +incorrect in INDRA before applying evidence_count_cutoff. Default is FALSE.} + +\item{api_key}{string, INDRA API key; only used if filter_by_curation is TRUE.} } \value{ list of 2 data.frames, nodes and edges